Klant: Useq

Project: Spatial Organ Atlas

Dataset: Kidney

date: 11 04 Thu 04 May, 2023

loading dependencies Please make sure the following packages are installed and required libraries can be loaded:

  • install.packages(“pkgbuild”) // pkgbuild::check_build_tools()
  • install.packages(“devtools”)
  • devtools::install_github(“Nanostring-Biostats/NanoStringNCTools”)
  • devtools::install_github(“Nanostring-Biostats/GeomxTools”, ref = “dev”)
  • BiocManager::install(“GeoMxWorkflows”)
  • devtools::install_github(“DavisLaboratory/standR”)
  • BiocManager::install(“SpatialDecon”)
  • BiocManager::install(“GSVA”)
  • install.packages(“plotly”)
  • install.packages(“DT”)
  • install.packages(“msigdbr”)
  • install.packages(“digest”)
  • install.packages(“rmarkdown”)
  • install.packages(“kable”)
  • BiocManager::install(“EBImage”)
  • install.packages(‘scattermore’)
  • install.packages(‘pbapply’)
  • install.packages(‘plotrix’)
  • install.packages(‘ggtext’)
  • install.packages(‘RBioFormats’)
  • BiocManager::install(“aoles/RBioFormats”)
  • install.packages(“C:/Users/inijman/Downloads/SpatialOmicsOverlay-0.99.13-beta.tar.gz”, dependencies = TRUE, repos = NULL)
  • devtools::install_github(“DavisLaboratory/standR”)
  • BiocManager::install(“clusterProfiler”)
  • BiocManager::install(“pathview”)
#load libraries

#stack_size <- getOption("pandoc.stack.size", default = "512m")
#options(java.parameters = c("-XX:+UseConcMarkSweepGC", "-Xmx8192m, -XX:MetaspaceSize=1024M"))
options(java.parameters = c("-XX:+UseConcMarkSweepGC", "-Xmx8192m"))
library(NanoStringNCTools)
library(GeomxTools)
library(GeoMxWorkflows)
library(SpatialDecon)
library(GSVA) #for pathway analyses
library(msigdbr) #for molecular signatures in pathway analyses
library(knitr)
library(dplyr)
library(ggforce)
library(ggplot2)
library(scales) # for percent
library(reshape2)  # for melt
library(cowplot)   # for plot_grid
library(umap)
library(Rtsne)
library(pheatmap)  # for pheatmap
library(ggrepel) 
library(scales) #for ggplot peaudolog to prevent errors on log(0)
library(DT)
library(plotly)
library(gridExtra)
library(RColorBrewer)
library(SpatialOmicsOverlay)
library(gt)
library(tidyverse)
library(clusterProfiler)
library(fgsea)
library(pathview)
library(png)
library(readxl)
library(viridis)

library(standR)
library(SpatialExperiment)
library(limma)
library(ggalluvial)
library(scater)

#BiocManager::install("sva")
library(sva)

#BiocManager::install("preprocessCore") #quantile norm
library(preprocessCore)

#install.packages("Polychrome") #Get colors
#library(Polychrome)


library(infercnv) # CNV
library("ape")
library("Biostrings")
library("ggtree")
library(tidytree)
library(ggpubr)

1 loading base files

# Reference the main folder 'file.path' containing the sub-folders with each data file type:
datadir<-file.path("L:/pkloosterman/Github/DKD_Kidney/")
#datadir<-file.path("E:/DKD_Kidnet/")

To locate a specific file path replace the above line with datadir <- file.path(“~/Folder/SubFolder/DataLocation”) replace the Folder, SubFolder, DataLocation as needed. The DataLocation folder should contain a dccs, pkcs, and annotation folder with each set of files present as needed automatically list files in each directory for use.

Take care you import a column with nuclei count separately if you want.

DCCFiles <- dir(file.path(datadir, "dccs"), pattern = ".dcc$",
                full.names = TRUE, recursive = TRUE)
PKCFiles <- dir(file.path(datadir, "pkcs"), pattern = ".pkc$",
                full.names = TRUE, recursive = TRUE)
SampleAnnotationFile <-
  dir(file.path(datadir, "annotation"), pattern = "^[^~]",
      full.names = TRUE, recursive = TRUE)

2 load data

Data <-
  readNanoStringGeoMxSet(dccFiles = DCCFiles,
                         pkcFiles = PKCFiles,
                         phenoDataFile = SampleAnnotationFile,
                         phenoDataSheet = "Template",
                         phenoDataDccColName = "Sample_ID",
                         protocolDataColNames = c("aoi", "roi"),
                         experimentDataColNames = c("panel"))

#save data to prevent loading time for retakes
#saveData<-Data
#Data<-saveData

#change Data column names and manual correction of spelling errors
Data@phenoData@data[["slide_name"]]<-Data@phenoData@data[["slide name"]]
Data@phenoData@data[["slide name"]]<-  NULL


#+1 references the slide name column
ann_size<-length(colnames(Data@phenoData@data)[grepl("ANN",colnames(Data@phenoData@data))])+1 
ann_names<-c(colnames(Data@phenoData@data)[grepl("ANN",colnames(Data@phenoData@data))],"slide_name")

# Feel free to change the order of which colors are appointed.
color<-c("#A349A4", "#FFFF33", "#E7298A", "#091833", "#1B9E77", "#D95F02", "#7570B3",  "#66A61E", "#E6AB02", "#8DD3C7", "#9F000F", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F", "#377EB8", "#984EA3", "#4DAF4A", "#FF71CE", "#FF7F00", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928")

# Use count and count_max to set the range of color needed for each column.
# With setNames() the color code is linked to each unique individual value of each column.
count=1
color_list = list()
for (ann in ann_names) {
  count_max = count+length(unique(Data@phenoData@data[[ann]]))-1
  color_list[[ann]]<-setNames(color[count:count_max], unique(Data@phenoData@data[[ann]]))
  count=count_max+1
}
value_names = c()
for (ann in ann_names) {
  value_names <- c(value_names, unique(Data@phenoData@data[[ann]]))
}

color_df <- as.data.frame(value_names)
color_df$color <- color[0:length(value_names)]

color_df %>% 
  mutate(color = fct_inorder(color)) |> 
  gt() %>% 
  data_color(columns = color, colors = as.character(color)) %>%
  tab_options(container.height = 500)
value_names color
DKD #A349A4
normal #FFFF33
disease3 #E7298A
disease4 #091833
normal3 #1B9E77
normal4 #D95F02
disease1B #7570B3
disease2B #66A61E
normal2B #E6AB02
glomerulus #8DD3C7
tubule #9F000F
abnormal #BEBADA
NA #FB8072
normal #80B1D3
niormal #FDB462
disease3 #B3DE69
disease4 #FCCDE5
normal3 #D9D9D9
normal4 #BC80BD
disease1B #CCEBC5
disease2B #FFED6F
normal2B #377EB8
paste("Reads from following runs used: ",unique(pData(protocolData(Data))$SeqSetId))
## [1] "Reads from following runs used:  "

3 Study design

pkcs <- annotation(Data)
modules <- gsub(".pkc", "", pkcs)
kable(data.frame(PKCs = pkcs, modules = modules))
PKCs modules
TAP_H_WTA_v1.0.pkc TAP_H_WTA_v1.0

Select the annotations we want to show, use `` to surround column names with spaces or special symbols

#count_mat <- dplyr::count(pData(Data), ANN1,ANN2,slide_name)
count_mat <- pData(Data) %>% dplyr::count(pData(Data)[c(ann_names)])

Simplify the slide names if required

# count_mat$slide_name <- gsub("disease", "d", gsub("normal", "n", count_mat$slide_name))
#count_mat$path_ann <- gsub("i", "", count_mat$path_ann) #correcting spelling error

Gather the data and plot in order: class, slide name, region, segment

test_gr <- gather_set_data(count_mat, 1:all_of(ann_size))
test_gr$x <- factor(test_gr$x, labels = ann_names)

Plot Sankey

ggplot(test_gr, height = 10, width = 10, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = ANN2), alpha = 0.5, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2) +
  geom_parallel_sets_labels(color = "white", size = 5) +
  theme_classic(base_size = 12) + 
  theme(legend.position = "bottom",
        axis.ticks.y = element_blank(),
        axis.line = element_blank(),
        axis.text.y = element_blank()) +
  scale_y_continuous(expand = expansion(0)) + 
  scale_x_discrete(expand = expansion(0)) +
  labs(x = "", y = "") +
  scale_fill_manual(values=color_list$ANN2) +
  annotate(geom = "segment", x = 4.25, xend = 4.25,
           y = 10, yend = 61, lwd = 2) +
  annotate(geom = "text", x = 4.19, y = 25, angle = 90, size = 5,
           hjust = 0.5, label = "50 segments")

4 QC & Pre-processing

Shift counts to one

#shift any expression counts with a value of 0 to 1 to enable in downstream transformations.
Data <- shiftCountsOne(Data, useDALogic = TRUE)

4.1 Segment QC

We first assess sequencing quality and adequate tissue sampling for every ROI/AOI segment.

Every ROI/AOI segment will be tested for:

Raw sequencing reads: segments with >1000 raw reads are removed. % Aligned,% Trimmed, or % Stitched sequencing reads: segments below ~80% for one or more of these QC parameters are removed. % Sequencing saturation ([1-deduplicated reads/aligned reads]%): segments below ~50% require additional sequencing to capture full sample diversity and are not typically analyzed until improved. Negative Count: this is the geometric mean of the several unique negative probes in the GeoMx panel that do not target mRNA and establish the background count level per segment; segments with low negative counts (1-10) are not necessarily removed but may be studied closer for low endogenous gene signal and/or insufficient tissue sampling. No Template Control (NTC) count: values >1,000 could indicate contamination for the segments associated with this NTC; however, in cases where the NTC count is between 1,000- 10,000, the segments may be used if the NTC data is uniformly low (e.g. 0-2 counts for all probes). Nuclei: >100 nuclei per segment is generally recommended; however, this cutoff is highly study/tissue dependent and may need to be reduced; what is most important is consistency in the nuclei distribution for segments within the study. Area: generally correlates with nuclei; a strict cutoff is not generally applied based on area.

4.1.1 Select Segment QC

First, we select the QC parameter cutoffs, against which our ROI/AOI segments will be tested and flagged appropriately. We have selected the appropriate study-specific parameters for this study. Note: the default QC values recommended above are advised when surveying a new dataset for the first time.

Default QC cutoffs are commented in () adjacent to the respective parameters study-specific values were selected after visualizing the QC results in more detail below

QC_params <-
  list(minSegmentReads = 1000, # Minimum number of reads (1000)
       percentTrimmed = 80,    # Minimum % of reads trimmed (80%)
       percentStitched = 80,   # Minimum % of reads stitched (80%)
       percentAligned = 75,    # Minimum % of reads aligned (80%)
       percentSaturation = 50, # Minimum sequencing saturation (50%)
       minNegativeCount = 1,   # Minimum negative control counts (10)
       maxNTCCount = 9000,     # Maximum counts observed in NTC well (1000)
       minNuclei = 20,        # Minimum # of nuclei estimated (100)
       minArea = 1000)         # Minimum segment area (5000)
Data <-
  setSegmentQCFlags(Data, qcCutoffs = QC_params)        

cat("pre-QC features:", dim(Data)[1], "\npre-QC samples:", dim(Data)[2])
## pre-QC features: 18642 
## pre-QC samples: 276
#Table for clarification
QCparams_df <- data.frame (
  items = c("minSegmentReads","percentTrimmed","percentStitched","percentAligned","percentSaturation",
            "minNegativeCount","maxNTCCount","minNuclei","minArea"),
  defaults = c(1000,80,80,80,50,10,1000,100,5000),
  actual = c(QC_params$minSegmentReads,QC_params$percentTrimmed,QC_params$percentStitched,QC_params$percentAligned,QC_params$percentSaturation,QC_params$minNegativeCount,QC_params$maxNTCCount,QC_params$minNuclei,QC_params$minArea)
)

datatable(QCparams_df, rownames=FALSE,
          caption = "QC thresholds",
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )
)

Collate QC Results

QCResults <- protocolData(Data)[["QCFlags"]]
flag_columns <- colnames(QCResults)
QC_Summary <- data.frame(Pass = colSums(!QCResults[, flag_columns]),
                         Warning = colSums(QCResults[, flag_columns]))

QCResults$QCStatus <- apply(QCResults, 1L, function(x) {
  ifelse(sum(x) == 0L, "PASS", "WARNING")
})

QC_Summary["TOTAL FLAGS", ] <-
  c(sum(QCResults[, "QCStatus"] == "PASS"),
    sum(QCResults[, "QCStatus"] == "WARNING"))

col_by <- "ANN1"
col_by_plate <- "Plate_ID"

4.2 Graphical summaries of QC statistics

Use the tab-menu to navigate!

QC_histogram <- function(assay_data = NULL,
                         annotation = NULL,
                         fill_by = NULL,
                         thr = NULL,
                         scale_trans = NULL) {
  plt <- ggplot(assay_data,
                aes_string(x = paste0("unlist(`", annotation, "`)"),
                           fill = fill_by)) +
    geom_histogram(bins = 50) +
    geom_vline(xintercept = thr, lty = "dashed", color = "black") +
    theme_bw() + guides(fill = "none") +
    facet_wrap(as.formula(paste("~", fill_by)), nrow = 4) +
    scale_fill_manual(values=color_list$ANN1) +
    labs(x = annotation, y = "segments, #", title = annotation)
  if(!is.null(scale_trans)) {
    plt <- plt +
      scale_x_continuous(trans = scale_trans)
  }
  plt
}

Trimmed

% Trimmed sequencing reads: segments below the QC parameters are removed.

QC_histogram(sData(Data), "Trimmed (%)", col_by, QC_params$percentTrimmed)

Stiched (%)

% Stiched sequencing reads: segments below the QC parameters are removed.

QC_histogram(sData(Data), "Stitched (%)", col_by, QC_params$percentStitched)

Aligned (%)

% Aligned sequencing reads: segments below the QC parameters are removed.

QC_histogram(sData(Data), "Aligned (%)", col_by,QC_params$percentAligned)

Sequencing Saturation (%)

% Sequencing saturation ([1-deduplicated reads/aligned reads]%): segments below the QC parameter require additional sequencing to capture full sample diversity and are not typically analyzed until improved.

QC_histogram(sData(Data), "Saturated (%)", col_by, QC_params$percentSaturation) +
  labs(title = "Sequencing Saturation (%)",
       x = "Sequencing Saturation (%)")

Area

Area: generally correlates with nuclei; a strict cutoff is not generally applied based on area.

QC_histogram(sData(Data), "area", col_by, QC_params$minArea, scale_trans = "log10")

Nuclei count

Nuclei: >100 nuclei per segment is generally recommended; however, this cutoff is highly study/tissue dependent and may need to be reduced; what is most important is consistency in the nuclei distribution for segments within the study.

QC_histogram(sData(Data), "nuclei", col_by, QC_params$minNuclei)

DuplicationRate

Show the percentage of duplicated reads per raw reads.

ggplot(pData(protocolData(Data)),
       aes(x = Plate_ID, fill=Plate_ID,
          y = (DeduplicatedReads/Raw))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Deduplicated / Raw reads") +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()

Negprobes vs Endogenous

Compare Negprobes vs Endogenous expression to see if the endogenous (real) probes are significantly higher than Negprobes. This means that there is a high signal-to-noise ratio.

tmp_target_Data <- aggregateCounts(Data)

#get negative probe data
negs<-subset(tmp_target_Data,CodeClass=="Negative")

p1<-ggplot(pData(negs),
       aes(x = ANN2, fill = ANN2,
          y = assayDataElement(negs, elt = "exprs"))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Negative probes Expression") +
  scale_y_continuous(limits = c(1,3000), trans = "log2") +
  theme_bw() + coord_flip()


# get endogenous probe data
end<-subset(tmp_target_Data,CodeClass=="Endogenous")

p2<-ggplot(pData(end),
       aes(x = ANN2, fill = ANN2,
           y = colMeans(assayDataElement(end, elt = "exprs")))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Endogenous probes Expression (mean)") +
  scale_y_continuous(limits = c(1,3000),trans = "log2") +
  theme_bw() + coord_flip()

pl <-list(p1,p2)
plot_grid(plotlist=pl, nrow=2, align='v')

Neg_probe reads compared to raw_reads

Show the difference between deduplicated reads and negative control reads. It is expected that deduplicated reads are substantially higher than negative control reads.

# make background total neg probe count
fdata_df<-fData(Data)
negprobesnames<-rownames(fdata_df[fdata_df$Negative==TRUE,])
temp_exp<-assayDataElement(Data,elt='exprs')
negprobe_expr_fd<-temp_exp[rownames(temp_exp) %in% negprobesnames,]
tot_neg_ctrl_reads<-colSums(negprobe_expr_fd)
tot_dedup_reads<-pData(protocolData(Data))$DeduplicatedReads

df<-data.frame('aoi'= names(tot_neg_ctrl_reads),'tot_dedup_reads' = as.numeric(tot_dedup_reads),'tot_neg_ctrl_reads'=as.numeric(tot_neg_ctrl_reads))
df<-melt(df,id="aoi")
ggplot(df,aes(fill=variable,y=value,x=aoi)) + 
  geom_bar(position="identity",stat="identity") +
  scale_y_continuous(trans = log2_trans()) +
  theme(legend.position="bottom",axis.text.x = element_blank(),axis.ticks.x=element_blank())                     

Duplicated reads vs Background

Compare duplicated reads and background noise to show that the reads significantly duplicated and thus checked to avoid the questioning about the validity of the reads.

# get dcc per plate. sum negprobe counts/dcc/plate
ggplot(pData(protocolData(Data)),
       aes(x = Plate_ID, fill=Plate_ID,
          y = DeduplicatedReads)) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Deduplicated / Raw reads") +
  scale_y_log10()+
  geom_hline(data =pData(protocolData(Data)) , 
           aes(yintercept = NTC, colour=Plate_ID)) +
  theme_bw()

Duplicated reads vs ROIarea

See if all ROIs contain duplicated reads for validity.

temp_df<-cbind(pData(Data),pData(protocolData(Data)),dcc=rownames(pData(Data)))

ggplot(temp_df,
       aes(x = dcc, colour=slide_name,
          y = (DeduplicatedReads/area) )) +
  geom_point() +
  ylim(0,20) + 
  labs(y = "Deduplicated reads / ROI area") +
  theme(axis.text.x = element_text(size =6, angle=90, hjust=1) )

Duplicated reads vs nuclei

Show the amount of deduplicated reads per nuclei.

temp_df<-cbind(pData(Data),pData(protocolData(Data)),dcc=rownames(pData(Data)))

ggplot(temp_df,
       aes(x = dcc, colour=slide_name,
          y = (DeduplicatedReads/nuclei) )) +
  geom_point() +
  ylim(0,50) +                                                      # Adjust per project
  labs(y = "Deduplicated reads / nuclei") +
  theme(axis.text.x = element_text(size =6, angle=90, hjust=1) )

4.3 Process Negative GeoMeans

Negative Count: this is the geometric mean of the several unique negative probes in the GeoMx panel that do not target mRNA and establish the background count level per segment; segments with low negative counts (1-10) are not necessarily removed but may be studied closer for low endogenous gene signal and/or insufficient tissue sampling.

# Calculate the negative geometric means for each module
# It will show only the negative probes geomean, so expect less segments.
negativeGeoMeans <- 
  esBy(negativeControlSubset(Data), 
       GROUP = "Module", 
       FUN = function(x) { 
         assayDataApply(x, MARGIN = 2, FUN = ngeoMean, elt = "exprs") 
       }) 
protocolData(Data)[["NegGeoMean"]] <- negativeGeoMeans

negCols <- paste0("NegGeoMean_", modules)
pData(Data)[, negCols] <- sData(Data)[["NegGeoMean"]]
for(ann in negCols) {
  plt <- QC_histogram(pData(Data), ann, col_by, 2, scale_trans = "log10")
  print(plt)
}

# Detatch neg_geomean columns ahead of aggregateCounts call

pData(Data) <- pData(Data)[, !colnames(pData(Data)) %in% negCols]

Show all NTC values, Freq = # of Segments with a given NTC count:

QC<-sData(Data)

ntc_df <-QC[,c("slide_name","Plate_ID","NTC_ID","NTC")]
temptable<-ntc_df %>% dplyr::count(ntc_df$slide_name,ntc_df$NTC_ID,ntc_df$Plate_ID,ntc_df$NTC)
colnames(temptable) <- c("Slide_name","NTC_ID","Plate_ID","NTC_count","Number_of_samples")
datatable(temptable, rownames = FALSE)
kable(table(NTC_Count = sData(Data)$NTC), col.names = c("NTC Count", "# of Segments"))
NTC Count # of Segments
3 64
113 71
397 47
8704 94
kable(QC_Summary, caption = "QC Summary Table for each Segment")
QC Summary Table for each Segment
Pass Warning
LowReads 272 4
LowTrimmed 276 0
LowStitched 273 3
LowAligned 266 10
LowSaturation 272 4
LowNegatives 276 0
HighNTC 276 0
LowNuclei 276 0
LowArea 265 11
TOTAL FLAGS 259 17
datatable(QC_Summary,
          caption = "AOI QC Summary",
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )
)

Show AOIs which fail critical QCs.

QC<-sData(Data)
undersat<-subset(QC, `Saturated (%)`<= QC_params$percentSaturation)

if(nrow(undersat)> 0) {

datatable(aggregate(undersat, by=list(undersat$SampleID),paste,collapse=";"),
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )
)}

Show the reasons why the AOIs fail critical QCs.

if(nrow(undersat)> 0) {

datatable(aggregate(undersat$QCFlags, by=list(undersat$SampleID),paste,collapse=";"),
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )) %>%
    formatStyle(columns = colnames(undersat$QCFlags), backgroundColor = styleEqual("TRUE", "#850000"), color = styleEqual("TRUE", "white")) 
  }

Subsetting our dataset has removed samples which did not pass QC

Data <- Data[, QCResults$QCStatus == "PASS"]

Generally keep the qcCutoffs parameters unchanged. Set removeLocalOutliers to FALSE if you do not want to remove local outliers

Data <- setBioProbeQCFlags(Data, 
                               qcCutoffs = list(minProbeRatio = 0.1,
                                                percentFailGrubbs = 20), 
                               removeLocalOutliers = FALSE)

ProbeQCResults <- fData(Data)[["QCFlags"]]

Define QC table for Probe QC

qc_df <- data.frame(Passed = sum(rowSums(ProbeQCResults[, -1]) == 0),
                    Global = sum(ProbeQCResults$GlobalGrubbsOutlier),
                    Local = sum(rowSums(ProbeQCResults[, -2:-1]) > 0
                                & !ProbeQCResults$GlobalGrubbsOutlier))

Subset object to exclude all that did not pass Ratio & Global testing

ProbeQCPassed <- 
  subset(Data, 
         fData(Data)[["QCFlags"]][,c("LowProbeRatio")] == FALSE &
           fData(Data)[["QCFlags"]][,c("GlobalGrubbsOutlier")] == FALSE)

Data <- ProbeQCPassed 
cat("After QC features:", dim(Data)[1], "\nAfter QC samples:", dim(Data)[2])
## After QC features: 18641 
## After QC samples: 259

Check how many unique targets the object has

length(unique(featureData(Data)[["TargetName"]]))
## [1] 18504

Collapse to targets

target_Data <- aggregateCounts(Data)

exprs(target_Data)[1:5, 1:2]
##       DSP-1001250007851-H-A02.dcc DSP-1001250007851-H-A03.dcc
## A2M                           485                         262
## NAT2                           15                          18
## ACADM                          31                          15
## ACADS                          27                          17
## ACAT1                          29                          24

Define LOQ SD threshold and minimum value

cutoff <- 2
minLOQ <- 2

4.4 Limit of Quantification

We define a limit of quantification (LOQ) per ROI/AOI segment based on the negative control probes to guide the filtering of segments and genes with low signal relative to background. The formula for calculating the LOQ in the \(i^{th}\) segment at \(n\) standard deviations (\(n = 2\) for this study) is: \(LOQ_i=geomean(NegProbe_i)*geoSD(NegProbe_i)^n\)

Calculate LOQ per module tested

LOQ <- data.frame(row.names = colnames(target_Data))
for(module in modules) {
  vars <- paste0(c("NegGeoMean_", "NegGeoSD_"),
                 module)
  if(all(vars[1:2] %in% colnames(pData(target_Data)))) {
    LOQ[, module] <-
      pmax(minLOQ,
           pData(target_Data)[, vars[1]] * 
             pData(target_Data)[, vars[2]] ^ cutoff)
  }
}
pData(target_Data)$LOQ <- LOQ

4.5 Filtering

After determining the limit of quantification (LOQ) per segment, we recommend filtering out either segments and/or genes with abnormally low signal. Filtering is an important step to focus on the true biological data of interest.

We determine the number of genes detected in each segment across the dataset.

LOQ_Mat <- c()
for(module in modules) {
  ind <- fData(target_Data)$Module == module
  Mat_i <- t(esApply(target_Data[ind, ], MARGIN = 1,
                     FUN = function(x) {
                       x > LOQ[, module]
                     }))
  LOQ_Mat <- rbind(LOQ_Mat, Mat_i)
}
# ensure ordering since this is stored outside of the geomxSet
LOQ_Mat <- LOQ_Mat[fData(target_Data)$TargetName, ]

4.5.1 Segment Gene Detection

We first filter out segments with exceptionally low signal. These segments will have a small fraction of panel genes detected above the LOQ relative to the other segments in the study. Let’s visualize the distribution of segments with respect to their % genes detected:

Save detection rate information to pheno data

pData(target_Data)$GenesDetected <- 
  colSums(LOQ_Mat, na.rm = TRUE)
pData(target_Data)$GeneDetectionRate <-
  pData(target_Data)$GenesDetected / nrow(target_Data)

Determine detection thresholds: 1%, 5%, 10%, 15%, >15%

pData(target_Data)$DetectionThreshold <- 
  cut(pData(target_Data)$GeneDetectionRate,
      breaks = c(0, 0.01, 0.05, 0.1, 0.15, 0.2,1),
      labels = c("<1%", "1-5%", "5-10%", "10-15%", "15-20%", ">20%"))

# stacked bar plot of different cut points (1%, 5%, 10%, 15%)
ggplot(pData(target_Data),
       aes(x = DetectionThreshold)) +
  geom_bar(aes(fill = ANN2)) +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  theme_bw() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  scale_fill_manual(values=color_list$ANN2) +
  labs(x = "Gene Detection Rate",
       y = "Segments, #",
       fill = "Segment Type")

cut percent genes detected at 1, 5, 10, 15

kable(table(pData(target_Data)$DetectionThreshold,
            pData(target_Data)$ANN2))
disease1B disease2B disease3 disease4 normal2B normal3 normal4
<1% 0 0 0 0 1 0 0
1-5% 0 1 0 0 6 0 0
5-10% 8 3 0 1 5 1 0
10-15% 13 11 1 3 2 3 0
15-20% 7 5 3 7 5 10 0
>20% 5 11 55 13 15 41 23
# set threshold for detectionlevel
# default 0.1
gene_det_threshold <- 0.05

target_Data <-
  target_Data[, pData(target_Data)$GeneDetectionRate >= gene_det_threshold]

dim(target_Data)
## Features  Samples 
##    18504      251

4.6 Manual removal of samples/classes

active_aois<-names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))

re-Collect annotations

# gather the data and plot in order: class, slide name, region, segment
#count_mat <- dplyr::count(pData(Data), ANN1,ANN2,ANN3,ANN4,slide_name)

temp_qc <- temp_df
temp_qc$QCResult <- QCResults$QCStatus
temp_qc$QCResult[temp_qc$QCResult == "WARNING"] <- "X"

#count_mat <- dplyr::count(a, ANN1, ANN2, slide_name, QCResult)
count_mat <- temp_qc %>% dplyr::count(temp_qc[c(ann_names, "QCResult")])

test_gr <- gather_set_data(count_mat, 1:(ann_size+1))
test_gr$x <- factor(test_gr$x, labels = c(ann_names, "QCResult"))

re-Plot Sankey

ggplot(test_gr, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = ANN2), alpha = 0.5, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2) +
  geom_parallel_sets_labels(color = "white", size = 5) +
  theme_classic(base_size = 17) + 
  theme(legend.position = "bottom",
        axis.ticks.y = element_blank(),
        axis.line = element_blank(),
        axis.text.y = element_blank()) +
  scale_y_continuous(expand = expansion(0)) + 
  scale_x_discrete(expand = expansion(0)) +
  scale_fill_manual(values=color_list$ANN2) +
  labs(x = "", y = "") +
  annotate(geom = "segment", x = 3.25, xend = 3.25, y = 10, 
           yend = 60, lwd = 2) +
  annotate(geom = "text", x = 3.19, y = 25, angle = 90, size = 5,
           hjust = 0.5, label = "50 segments")

4.7 Gene Detection Rate

Calculate detection rate

LOQ_Mat <- LOQ_Mat[, colnames(target_Data)]
fData(target_Data)$DetectedSegments <- rowSums(LOQ_Mat, na.rm = TRUE)
fData(target_Data)$DetectionRate <-
  fData(target_Data)$DetectedSegments / nrow(pData(target_Data))

Gene of interest detection table

goi <- c("PDCD1", "CD274", "IFNG", "CD8A", "CD68", "EPCAM",
         "KRT18", "NPHS1", "NPHS2", "CALB1", "CLDN8")
goi_df <- data.frame(
  Gene = goi,
  Number = fData(target_Data)[goi, "DetectedSegments"],
  DetectionRate = percent(fData(target_Data)[goi, "DetectionRate"]))

4.8 Gene Filtering

We will graph the total number of genes detected in different percentages of segments. Based on the visualization below, we can better understand global gene detection in our study and select how many low detected genes to filter out of the dataset. Gene filtering increases performance of downstream statistical tests and improves interpretation of true biological signal.

Plot detection rate

plot_detect <- data.frame(Freq = c(1, 5, 10, 20, 30, 50))

# Manual gene cut-off
cutoff <- 5
cutoff <- which(sapply(plot_detect$Freq, FUN=function(X) cutoff %in% X)) + 0.5 # Select step from plot_detect. 1 = 1, 5 = 2, 10 = 3, etc. +0.5 to dodge bin.

plot_detect$Number <-
  unlist(lapply(c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5),
                function(x) {sum(fData(target_Data)$DetectionRate >= x)}))
plot_detect$Rate <- plot_detect$Number / nrow(fData(target_Data))
rownames(plot_detect) <- plot_detect$Freq

ggplot(plot_detect, aes(x = as.factor(Freq), y = Rate, fill = Rate)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = formatC(Number, format = "d", big.mark = ",")),
            vjust = 1.6, color = "black", size = 4) +
  
  geom_vline(xintercept = 3.5, color = "red") +
  geom_vline(xintercept = cutoff, color = "red") +
  annotate(x=3.5,y=0.1,label="Default cut-off",size=3,geom="label") +
  annotate(x=cutoff,y=0.1,label="Manual cut-off",size=3,geom="label") +
  
  scale_fill_gradient2(low = "orange2", mid = "lightblue",
                       high = "dodgerblue3", midpoint = 0.65,
                       limits = c(0,1),
                       labels = scales::percent) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent, limits = c(0,1),
                     expand = expansion(mult = c(0, 0))) +
  labs(x = "% of Segments",
       y = "Genes Detected, % of Panel > LOQ")

Subset to target genes detected in at least a manual percentage of the samples with a default of 10%. Also manually include the negative control probe, for downstream use.

# default=0.1
negativeProbefData <- subset(fData(target_Data), CodeClass == "Negative")
neg_probes <- unique(negativeProbefData$TargetName)
target_Data <- 
  target_Data[fData(target_Data)$DetectionRate >= 0.05 |
                    fData(target_Data)$TargetName %in% neg_probes, ]

# retain only detected genes of interest
goi <- goi[goi %in% rownames(target_Data)]

dim(target_Data)
## Features  Samples 
##    12241      251

5 Normalization

We will now normalize the GeoMx data for downstream visualizations and differential expression. The two common methods for normalization of DSP-NGS RNA data are i) quartile 3 (Q3) or ii) background normalization.

Both of these normalization methods estimate a normalization factor per segment to bring the segment data distributions together. More advanced methods for normalization and modeling are under active development. However, for most studies, these methods are sufficient for understanding differences between biological classes of segments and samples.

Q3 normalization is typically the preferred normalization strategy for most DSP-NGS RNA studies. Given the low negative probe counts in this particular dataset as shown during Segment QC, we would further avoid background normalization as it may be less stable.

Before normalization, we will explore the relationship between the upper quartile (Q3) of the counts in each segment with the geometric mean of the negative control probes in the data. Ideally, there should be a separation between these two values to ensure we have stable measure of Q3 signal. If you do not see sufficient separation between these values, you may consider more aggressive filtering of low signal segments/genes.

Q3 and quantile normalization are radically different in their approach. Q3 normalization relies a normalization factor that aligns the third quartile gene count value for all samples. This method does not address potential global variations in data distributions or expression ranges. In contrast, quantile normalization, which is widely used to normalize gene expression data derived from microarrays, aligns count values according to the rank of the genes and forces the data of all samples into the same distribution. It is therefore inherent to this method that the difference in count value range or signal-to-noise ratio is no longer dependent on raw data distributions1.

note that quantile normalization, due to its course method of normalization, may limit the detection of more subtle differences in gene expression.

Graph Q3 value vs negGeoMean of Negatives

ann_of_interest <- "ANN2"
Stat_data <- 
  data.frame(row.names = colnames(exprs(target_Data)),
             Segment = colnames(exprs(target_Data)),
             Annotation = pData(target_Data)[, ann_of_interest],
             Q3 = unlist(apply(exprs(target_Data), 2,
                               quantile, 0.75, na.rm = TRUE)),
             NegProbe = exprs(target_Data)[neg_probes, ])
Stat_data_m <- melt(Stat_data, measure.vars = c("Q3", "NegProbe"),
                    variable.name = "Statistic", value.name = "Value")

plt1 <- ggplot(Stat_data_m,
               aes(x = Value, fill = Statistic)) +
  geom_histogram(bins = 40) + theme_bw() +
  scale_x_continuous(trans = "log2") +
  facet_wrap(~Annotation, nrow = 1) + 
  scale_fill_brewer(palette = 3, type = "qual") +
  labs(x = "Counts", y = "Segments, #")

plt2 <- ggplot(Stat_data,
               aes(x = NegProbe, y = Q3, color = Annotation)) +
  geom_abline(intercept = 0, slope = 1, lty = "dashed", color = "darkgray") +
  geom_point() + guides(color = "none") + theme_bw() +
  scale_x_continuous(trans = "log2") + 
  scale_y_continuous(trans = "log2") +
  theme(aspect.ratio = 1) +
  labs(x = "Negative Probe GeoMean, Counts", y = "Q3 Value, Counts")

plt3 <- ggplot(Stat_data,
               aes(x = NegProbe, y = Q3 / NegProbe, color = Annotation)) +
  geom_hline(yintercept = 1, lty = "dashed", color = "darkgray") +
  geom_point() + theme_bw() +
  scale_x_continuous(trans = "log2") + 
  scale_y_continuous(trans = "log2") +
  theme(aspect.ratio = 1) +
  labs(x = "Negative Probe GeoMean, Counts", y = "Q3/NegProbe Value, Counts")

btm_row <- plot_grid(plt2, plt3, nrow = 1, labels = c("B", ""),
                     rel_widths = c(0.43,0.57))
plot_grid(plt1, btm_row, ncol = 1, labels = c("A", ""))

Q3 norm (75th percentile) for WTA/CTA with or without custom spike-ins

target_Data <- normalize(target_Data ,
                             norm_method = "quant", 
                             desiredQuantile = .75,
                             toElt = "q_norm")
#, data_type = "RNA" depricated after 4.1

Quantile Normalization

quantile <- normalize.quantiles(target_Data@assayData[["exprs"]])
rownames(quantile) <- rownames(target_Data@assayData[["exprs"]])
colnames(quantile) <- colnames(target_Data@assayData[["exprs"]])

assayDataElement(object = target_Data, elt = "quantile_norm") <-  quantile

Background normalization for WTA/CTA without custom spike-in

target_Data <- normalize(target_Data,
                             norm_method = "neg", 
                             fromElt = "exprs",
                             toElt = "neg_norm")

# , data_type = "RNA" depricated after 4.1

Overwrite the dafault q_norm with another normalization method if needed

# Normalization options are: default "q_norm", "neg_norm" or "quantile_norm".
assayDataElement(object = target_Data, elt = "q_norm") <- assayDataElement(object = target_Data, elt = "q_norm") 

5.1 Visualize the first 10 segments with each normalization method

Use the tab-menu to navigate!

#Fix zero values, which go to -inf in log transform in standard boxplot
# temp <-as.matrix(exprs((target_Data)[,1:10]))
# long <- melt(temp)
# colnames(long) <- c("gene","segment","count")
# ggplot(long, aes(x=segment,y=count)) +
#     geom_boxplot(fill="#9EDAE5") +
#     scale_y_continuous(trans=scales::pseudo_log_trans(base = 10)) +
#     scale_x_discrete(labels=c(1:10)) +
#     labs(title="Raw counts", x="segment", y = "Counts, Raw")
# 
# 
# temp <-as.matrix(assayDataElement(target_Data[,1:10], elt = "q_norm"))
# long <- melt(temp)
# colnames(long) <- c("gene","segment","count")
# ggplot(long, aes(x=segment,y=count)) +
#     geom_boxplot(fill = "#2CA02C") +
#     scale_y_continuous(trans=scales::pseudo_log_trans(base = 10)) +
#     scale_x_discrete(labels=c(1:10)) +
#     labs(title="Q3 Norm Counts", x="segment", y = "Counts, Q3 Normalized")
# 
# 
# temp <-as.matrix(assayDataElement(target_Data[,1:10], elt = "neg_norm"))
# long <- melt(temp)
# colnames(long) <- c("gene","segment","count")
# ggplot(long, aes(x=segment,y=count)) +
#     geom_boxplot(fill = "#FF7F0E") +
#     scale_y_continuous(trans=scales::pseudo_log_trans(base = 10)) +
#     scale_x_discrete(labels=c(1:10)) +
#     labs(title="Neg Norm Counts", x="segment", y = "Counts, Neg. Normalized")

raw counts

boxplot(exprs(target_Data)[,1:8],
        col = "#9EDAE5", main = "Raw Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Raw")

Q3 normalized

Q3 (3rd quartile of all selected targets) is the recommended normalization method for NGS data for all targets that are above the limit of quantitation (LOQ). Q3 normalization uses the top 25% of expressers to normalize across ROIs/segments, so it is robust to changes in expression of individual genes and ideal for making comparisons across ROIs/segments.

boxplot(assayDataElement(target_Data[,1:8], elt = "q_norm"),
        col = "#2CA02C", main = "Q3 Norm Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Q3 Normalized")

Quantile normalized (testing phase)

The quantile normalization involves first ranking the gene of each sample by magnitude, calculating the average value for genes occupying the same rank, and then substituting the values of all genes occupying that particular rank with this average value. The next step is to reorder the genes of each sample in their original order2.

boxplot(assayDataElement(target_Data[,1:8], elt = "quantile_norm"),
        col = "pink", main = "Quantile Norm Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Quantile Normalized")

Negative probe normalization

For each sample, an RNA-probe-pool-specific negative probe normalization factor was generated on the basis of the geometric mean of negative probes in each pool.

boxplot(assayDataElement(target_Data[,1:8], elt = "neg_norm"),
        col = "#FF7F0E", main = "Neg Norm Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Neg. Normalized")

6 Unsupervised Analysis

6.1 UMAP

Use the tab-menu to navigate!

1

custom_umap <- umap::umap.defaults
custom_umap$random_state <- 42
# run UMAP

umap_out <-
  umap(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
       config = custom_umap)
pData(target_Data)[, c("UMAP1", "UMAP2")] <- umap_out$layout[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = slide_name, shape = ANN1)) +

  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()

2

ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = ANN3, shape = ANN1)) +

  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()

6.2 Run tSNE

Use the tab-menu to navigate!

One common approach to understanding high-plex data is dimension reduction. Two common methods are UMAP and tSNE, which are non-orthogonally constrained projections that cluster samples based on overall gene expression. In this study, we see by either UMAP (from the umap package) or tSNE (from the Rtsne package)

1

tsne_out <-
  Rtsne(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
        perplexity = ncol(target_Data)*.15)
pData(target_Data)[, c("tSNE1", "tSNE2")] <- tsne_out$Y[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = tSNE1, y = tSNE2, shape = slide_name, color = ANN2)) +
  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()

2

tsne_out <-
  Rtsne(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
        perplexity = ncol(target_Data)*.15)
pData(target_Data)[, c("tSNE1", "tSNE2")] <- tsne_out$Y[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = tSNE1, y = tSNE2, color = ANN3, shape = ANN1)) +
  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()

If no bias is observed set batch correction needed on FALSE, otherwise TRUE.

batch_correction_needed = FALSE

6.3 Batch correction

If the data is observed to have a batch effect it can be corrected with the methods: RUV4, LIMMA, or Combat-Seq. Each method is done and the best one is picked and used.

The standR package is short for Spatial transcriptomics analyzes and decoding in R, it aims at providing good practice pipeline and useful functions for users to analyze Nanostring’s GeoMx DSP data. In the Nanostring’s GeoMX DSP protocol, due to the fact that one slide is only big enough for a handful of tissue segments (ROIs), it is common that we see the DSP data being confounded by the batch effect introduced by different slides. In order to establish fair comparison between ROIs later on, it is necessary to remove this batch effect from the data. (https://bioconductor.org/packages/release/bioc/vignettes/standR/inst/doc/standR_introduction.html)

For RUV4 correction, the function is requiring 3 parameters other than the input object, including factors: the factor of interest, i.e. the biological variation we plan to keep; NCGs: the list of negative control genes detected using the function findNCGs; and k: is the number of unwanted factors to use, in the RUV documentation, it is suggest that we should use the smallest k once we don’t observe technical variation in the data.

Another option is set the parameter method to “Limma”, which uses the remove batch correction method from limma. In this mode, the function is requiring 2 parameters, including batch: a vector that indicating batches for all samples; and design: a design matrix which is generated by model.matrix, in the design matrix, all biologically-relevant factors should be included.

ComBat-seq is a batch effect adjustment tool for bulk RNA-seq count data. It is an improved model based on the popular ComBat, to address its limitations through novel methods designed specifically for RNA-Seq studies. ComBat-seq takes untransformed, raw count matrix as input. Same as ComBat, it requires a known batch variable. (https://github.com/zhangyuqing/ComBat-seq)

Set up standR object

count_geomx <- as.data.frame(target_Data@assayData[["exprs"]])

sample_geomx <- target_Data@phenoData@data
sample_geomx$roi <- target_Data@protocolData@data$roi
sample_geomx <- as.data.frame(sample_geomx)
sample_geomx$Sample_ID <- rownames(sample_geomx)
sample_geomx$SegmentDisplayName <- paste(sample_geomx$`scan name`, sample_geomx$roi, sample_geomx$segment, sep = " | ")
sample_geomx$ROICoordinateX <- 1
sample_geomx$ROICoordinateY <- 1


feature_geomx <- fData(Data)
feature_geomx <- feature_geomx[c("RTS_ID", "TargetName", "ProbeID", "Negative")]
feature_geomx <- as.data.frame(feature_geomx)
rownames(feature_geomx) <- NULL

matching <- sample_geomx$SegmentDisplayName[sample_geomx$Sample_ID %in% colnames(count_geomx)]
colnames(count_geomx) <- matching
count_geomx$TargetName <- rownames(count_geomx)
rownames(count_geomx) <- NULL

spe <- readGeoMx(count_geomx, sample_geomx, featureAnnoFile = feature_geomx, hasNegProbe = TRUE)

colData(spe)$regions <- paste0(colData(spe)$ANN2,"_",colData(spe)$ANN1) |> 
  (\(.) gsub("_Geometric Segment","",.))() |>
  paste0("_",colData(spe)$pathology) |>
  (\(.) gsub("_NA","_ns",.))()

colData(spe)$regions <- paste0(colData(spe)$ANN2,"_",colData(spe)$ANN1) |> 
  (\(.) gsub("_Geometric Segment","",.))() |>
  paste0("_",colData(spe)$pathology) |>
  (\(.) gsub("_NA","_ns",.))()
colData(spe)$biology <- paste0(colData(spe)$regions)

See optimal k value for RUV4

spe <- findNCGs(spe, batch_name = "slide_name", top_n = 500)
findBestK(spe, maxK = 10, factor_of_int = "biology", NCGs = metadata(spe)$NCGs, factor_batch = "slide_name")

RUV4

ruv4 <- geomxBatchCorrection(spe, factors = "biology", 
                   NCGs = metadata(spe)$NCGs, k = 1)

plotPairPCA(ruv4, assay = 2, color = slide_name, shape = regions, title = "RUV4 removeBatch")

plotRLExpr(ruv4, assay = 2, color = `slide_name`) + ggtitle("RUV4 removeBatch")

Limma

limma <- geomxBatchCorrection(spe,
                       batch = colData(spe)$`slide_name`, method = "Limma",
                       design = model.matrix(~ 0 + ANN1 + regions, 
                                             data = colData(spe)))

plotPairPCA(limma, assay = 2, color = slide_name, shape = regions, title = "Limma removeBatch")

plotRLExpr(limma, assay = 2, color = slide_name) + ggtitle("Limma removeBatch")

CombatSeq

adjusted <- ComBat_seq(target_Data@assayData[["exprs"]], batch=target_Data@phenoData@data[["slide_name"]], group=target_Data@phenoData@data[["ANN2"]])
assayDataElement(object = target_Data, elt = "combat") <-  adjusted

umap_out2 <-
  umap(t(log2(assayDataElement(target_Data , elt = "combat"))),
       config = custom_umap)
pData(target_Data)[, c("UMAP1", "UMAP2")] <- umap_out2$layout[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = slide_name, shape = ANN1)) +

  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw() + ggtitle("CombatSeq removeBatch")

Compare limma vs RUV4

spe_list <- list(spe, ruv4, limma)

plotClusterEvalStats(spe_list = spe_list,
                     bio_feature_name = "regions",
                     batch_feature_name = "slide_name",
                     data_names = c("Raw","RUV4","Limma"))

Add batch correction result to target_Data

neg_probes_save <- t(as.matrix(target_Data@assayData$q_norm["NegProbe-WTX",]))
rownames(neg_probes_save) <- "NegProbe-WTX"

# Depending on correct method, change the word "limma" to "ruv4" or vice versa.
limma <-limma@assays@data@listData[["logcounts"]]
limma <- rbind(limma, neg_probes_save)
colnames(limma) <- colnames(as.data.frame(target_Data@assayData[["q_norm"]]))
assayDataElement(object = target_Data, elt = "limma") <-  limma

ruv4 <-ruv4@assays@data@listData[["logcounts"]]
ruv4 <- rbind(ruv4, neg_probes_save)
colnames(ruv4) <- colnames(as.data.frame(target_Data@assayData[["q_norm"]]))
assayDataElement(object = target_Data, elt = "ruv4") <-  ruv4

Choose method

# choose method to replace q_norm or set it to ""
method <- ""

# replace q_norm with chosen method
if (!method == "") {
  # save orginal q_norm
  assayDataElement(object = target_Data, elt = "original") <-  assayDataElement(object = target_Data, elt = "q_norm")
  
  assayDataElement(object = target_Data, elt = "q_norm") <-  assayDataElement(object = target_Data, elt = method)
  print(paste("Batch correction method:", method))
} else {print("No batch correction needed")}
## [1] "No batch correction needed"

Umap after batch correction

umap_out <-
  umap(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
       config = custom_umap)
pData(target_Data)[, c("UMAP1", "UMAP2")] <- umap_out$layout[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = slide_name, shape = ANN1)) +
  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()

6.4 Clustering high CV Genes

Another approach to explore the data is to calculate the coefficient of variation (\(CV\)) for each gene (\(g\)) using the formula \(CV_g=SD_g/mean_g\). We then identify genes with high CVs that should have large differences across the various profiled segments. This unbiased approach can reveal highly variable genes across the study.

We plot the results using unsupervised hierarchical clustering, displayed as a heatmap.

# create a log2 transform of the data for analysis
assayDataElement(object = target_Data, elt = "log_q") <-
  assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")

# create CV function
calc_CV <- function(x) {sd(x) / mean(x)}
CV_dat <- assayDataApply(target_Data,
                         elt = "log_q", MARGIN = 1, calc_CV)
# show the highest CD genes and their CV values
sort(CV_dat, decreasing = TRUE)[1:5]
##   CAMK2N1    AKR1C1      AQP2       REN     GDF15 
## 0.6344192 0.5238995 0.4752667 0.4414027 0.4276390

Table of CV values

# show the highest CD genes and their CV values
datatable(as.data.frame(CV_dat),
          extensions = 'Buttons', options = list (
            order = list(1, 'desc'),
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          ), caption = "CV values of genes" 
) %>% formatRound(columns=c("CV_dat"), digits=3)

Heatmap genes in the top 3rd of the CV values

For heatmap clustering methods see: General heatmap info: https://www.rdocumentation.org/packages/pheatmap/versions/1.0.12/topics/pheatmap Distance methods further explained: https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist clustering methods further explained: https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust

# choose clustering methods for heatmaps
clust_method <- "average"    # options: average, ward.D, ward.D2, single, complete, mcquitty, median and centroid.
# choose clustering distance methods for heatmaps
dist_method <- "correlation" # options: correlation, euclidean, maximum, manhattan, canberra, binary and minkowski.

GOI <- names(CV_dat)[CV_dat > quantile(CV_dat, 0.75)]

pheatmap(assayDataElement(target_Data[GOI, ], elt = "log_q"),
         scale = "row",
         cutree_cols = 3,
         cutree_rows = 2,
         show_rownames = FALSE, show_colnames = TRUE,
         border_color = NA,
         drop_levels = TRUE,
         clustering_method = clust_method,
         clustering_distance_rows = dist_method,
         clustering_distance_cols = dist_method,
         breaks = seq(-3, 3, 0.05),
         color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])

assayDataElement(object = target_Data, elt = "log_q") <-  assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")
log_q <-as.data.frame(assayDataElement(target_Data, elt= "log_q"))
#batch <-as.data.frame(assayDataElement(target_Data, elt= "batch"))

6.5.0 Create subset of data

If no subset is needed set subset_needed on FALSE, otherwise TRUE.

# determine AOIs to use
subset_needed <- FALSE

#active_aois<-rownames(ann)[ann$patient=="p4"]
active_aois<- names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))

6.5.1 Clustering high CV genes for subset

Calculating CV values

# create a log2 transform of the data for analysis
assayDataElement(object = target_Data, elt = "log_q") <-
  assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")

# create CV function
calc_CV <- function(x) {sd(x) / mean(x)}
CV_dat <- assayDataApply(target_Data[,active_aois],
                         elt = "log_q", MARGIN = 1, calc_CV)

Table of CV values

# show the highest CD genes and their CV values
datatable(as.data.frame(CV_dat),
          extensions = 'Buttons', options = list (
            order = list(1, 'desc'),
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          ), caption = "CV values of genes" 
) %>% formatRound(columns=c("CV_dat"), digits=3)

Heatmap on of subset, genes in the top 3rd of the CV values

# choose clustering methods for heatmaps
clust_method <- "average"    # options: average, ward.D, ward.D2, single, complete, mcquitty, median and centroid.
# choose clustering distance methods for heatmaps
dist_method <- "correlation" # options: correlation, euclidean, maximum, manhattan, canberra, binary and minkowski.

# Identify genes in the top 3rd of the CV values
GOI <- names(CV_dat)[CV_dat > quantile(CV_dat, 0.75)]
pheatmap(assayDataElement(target_Data[GOI,active_aois ], elt = "log_q"),
        scale = "row",
        fontsize_row = 5,
        cutree_cols = 3,
        cutree_rows = 2,
        show_rownames = FALSE, show_colnames = TRUE,
        border_color = NA,
        clustering_method = clust_method,
        clustering_distance_rows = dist_method,
        clustering_distance_cols = dist_method,
        breaks = seq(-3, 3, 0.05),
        color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
       annotation_colors = color_list,
        annotation_col =
          pData(target_Data)[, ann_names])

7.1 Differential Expression

t-test

#gc()
plots<-list()
tables<-list()
labels<-list()
test<-"ttest"
mtc<-"BY"
#options: "holm"       "hochberg"   "hommel"     "bonferroni" "BH"         "BY"         "fdr" 
counter=1

comps_df<-data.frame(comp='',val='')

for(region in unique(pData(target_Data)$ANN3)) {
  for (active_group1 in unique(pData(target_Data)$ANN1)) {
    for (active_group2 in unique(pData(target_Data)$ANN1)) {
      
      #supress reduncant compares
      if(active_group1==active_group2) {next}
      comp<-paste(sort(c(region, active_group1,active_group2)),collapse = "_")
      if(comp %in% comps_df$comp) {next}
      temp_df<-data.frame(comp=comp ,val=1)
      comps_df<-rbind(comps_df,temp_df)
      
      labels[[counter]]<-paste(active_group1," vs ", active_group2)
      group1<-log_q[,names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))[pData(target_Data)$ANN1==active_group1 & pData(target_Data)$ANN3==region]]
      group2<-log_q[,names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))[pData(target_Data)$ANN1==active_group2 & pData(target_Data)$ANN3==region]]
      
      #run t_tests  
      results<-as.data.frame ( apply(log_q, 1, function(x) t.test(x[colnames(group1)],x[colnames(group2)])$p.value) )
      colnames(results)<-"raw_p_value"
      
      #multiple_testing_correction
      adj_p_value<- p.adjust(results$raw_p_value,method=mtc)
      results<-cbind(results,adj_p_value)
      
      #calc_fdr
      FDR<- p.adjust(results$raw_p_value,method="fdr")
      results<-cbind(results,FDR)
      
      #fold_changes
      #as base data is already log transformed, means need to be subtracted to get FC in log space
      fchanges<-as.data.frame(apply(log_q, 1, function(x) (mean(x[colnames(group1)]) - mean(x[colnames(group2)]))))
      colnames(fchanges)<-"FC"
      #paste("FC",active_group1," / ",active_group2)
      results<-cbind(results,fchanges)
      
      #add genenames
      results$Gene<-rownames(results)
      
      #set categories based on P-value & FDR for plotting
      results$Color <- "NS or FC < 1"
      results$Color[results$raw_p_value < 0.05] <- "P < 0.05"
      results$Color[results$FDR < 0.05] <- "FDR < 0.05"
      results$Color[results$FDR < 0.001] <- "FDR < 0.001"
      results$Color[abs(results$FC) <1] <- "NS or FC < 1"
      results$Color <- factor(results$Color,
                              levels = c("NS or FC < 1", "P < 0.05", "FDR < 0.05", "FDR < 0.001"))
      
      #vulcanoplot
      
      # pick top genes for either side of volcano to label
      # order genes for convenience:
      
      results$invert_P <- (-log10(results$adj_p_value)) * sign(results$FC)
      top_g <- c()
      top_g <- c(top_g,
                 results[, 'Gene'][
                   order(results[, 'invert_P'], decreasing = TRUE)[1:20]],
                 results[, 'Gene'][order(results[, 'invert_P'], decreasing = FALSE)[1:20]])
      top_g<- unique(top_g)
      results <- results[, -1*ncol(results)] # remove invert_P from matrix
      
      # Graph results
      
      plots[[counter]]<- ggplot(results,
                                      aes(x = FC, y = -log10(raw_p_value),
                                          color = Color, label = Gene)) +
        geom_vline(xintercept = c(1, -1), lty = "dashed") +
        geom_hline(yintercept = -log10(0.05), lty = "dashed") +
        geom_point() +
        labs(x = paste("Enriched in", active_group2," <- log2(FC) -> Enriched in", active_group1),
             y = "Significance, -log10(P)",
             color = "Significance") +
        scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                      `FDR < 0.05` = "lightblue",
                                      `P < 0.05` = "orange2",
                                      `NS or FC < 0.5` = "gray"),
                           guide = guide_legend(override.aes = list(size = 4))) +
        scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
        geom_text_repel(data = subset(results, (-1>FC| FC>1) & FDR < 0.05 & Gene %in% top_g),
                        point.padding = 0.15, color = "black", size=3.5,
                        min.segment.length = .1, box.padding = .2, lwd = 2,
                        max.overlaps = 50) +
        theme_bw(base_size = 10) +
        theme(legend.position = "bottom") +
        ggtitle(paste("class: ", region," - ", test, mtc,"multitest corr"))
      
      #store tables for display later
      tables[[counter]]<-results
      
      counter = counter+1
      #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
    }
  }
}
grid.arrange(grobs=plots,ncol=2)

#strangly does not appear in html output -> chucks are limited to one datatable per chuck. The htmltools are a way around this but does show them stacked in a box.
results_tables <- htmltools::tagList()
for (c in (2:counter-1)) {
  #Gene %in% GOI
  results_tables[[c]] <- datatable(subset(tables[[c]], Color == c("FDR < 0.001",  "P < 0.05")), 
           rownames=FALSE,
           extensions = c('Buttons'), options = list (
              dom = 'Bftrip',
              buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
            ), height = 550,
           caption = paste("DE results ", labels[[c]]),filter='top') %>% formatRound(columns=c("raw_p_value","adj_p_value","FDR","FC"), digits=3)
  #cat('\n\n<!-- -->\n\n')
}            
results_tables
print("") # for layout of next section
## [1] ""

7.2 DE analysis with LMM

A common statistical approach is to use a linear mixed-effect model (LMM). The LMM allows the user to account for the subsampling per tissue; in other words, we adjust for the fact that the multiple regions of interest placed per tissue section are not independent observations, as is the assumption with other traditional statistical tests. The formulation of the LMM model depends on the scientific question being asked.

Overall, there are two flavors of the LMM model when used with GeoMx data: i) with and ii) without random slope.

When comparing features that co-exist in a given tissue section, a random slope is included in the LMM model. When comparing features that are mutually exclusive in a given tissue section the LMM model does not require a random slope.

Mostly, we use patient/sample as Random Intercept when they are combined on slides.

# convert test variables to factors
pData(target_Data)$testClass <- factor(pData(target_Data)$ANN1, unique(target_Data$ANN1))
pData(target_Data)[["slide"]] <- factor(pData(target_Data)[["slide_name"]])
#assayDataElement(object = target_Data, elt = "log_q") <- assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")

# run LMM:
# formula follows conventions defined by the lme4 package
lmm_results <- c()
for (region in unique(pData(target_Data)$ANN3)) {
    ind <- pData(target_Data)$ANN3 == region

    mixedOutmc <-
        mixedModelDE(target_Data[,ind],
                     elt = "log_q",
                     modelFormula = ~ testClass + (1 | slide),
                     groupVar = "testClass",
                     nCores = parallel::detectCores(),
                     multiCore = TRUE)

    # format results as data.frame
    r_test <- do.call(rbind, mixedOutmc["lsmeans", ])
    tests <- rownames(r_test)
    r_test <- as.data.frame(r_test)
    r_test$Contrast <- tests

    # use lapply in case you have multiple levels of your test factor to
    # correctly associate gene name with it's row in the results table
    r_test$Gene <-
        unlist(lapply(colnames(mixedOutmc),
                      rep, nrow(mixedOutmc["lsmeans", ][[1]])))
    r_test$Subset <- region
    r_test$FDR <- p.adjust(r_test$`Pr(>|t|)`, method = "fdr")
    r_test <- r_test[, c("Gene", "Subset", "Contrast", "Estimate",
                         "Pr(>|t|)", "FDR")]
    
    lmm_results <- rbind(lmm_results, r_test)
}

7.3 Vulcanoplot + table of LMM

# Categorize Results based on P-value & FDR for plotting
fc_threshold = 1

lmm_results$Color <- paste("NS or FC < ",fc_threshold = 1)
lmm_results$Color[lmm_results$`Pr(>|t|)` < 0.05] <- "P < 0.05"
lmm_results$Color[lmm_results$FDR < 0.05] <- "FDR < 0.05"
lmm_results$Color[lmm_results$FDR < 0.001] <- "FDR < 0.001"
lmm_results$Color[abs(lmm_results$Estimate) < fc_threshold] <- paste("NS or FC < ",fc_threshold = 1)
lmm_results$Color <- factor(lmm_results$Color,
                        levels = c("NS or FC < 1", "P < 0.05",
                                   "FDR < 0.05", "FDR < 0.001"))
counter = 1
plots_lmm <- list()
for(c in unique(lmm_results$Subset) ) {
  lmm_results_slice = lmm_results[lmm_results$Subset == c,]

  # pick top genes for either side of volcano to label
  # order genes for convenience:
  lmm_results_slice$invert_P <- (-log10(lmm_results_slice$`Pr(>|t|)`)) * sign(lmm_results_slice$Estimate)
  
  #loop here over tested conditions if applicable
  top_g <- c()
  top_g <- c(top_g,
    lmm_results_slice[, 'Gene'][
      order(lmm_results_slice[, 'invert_P'], decreasing = TRUE)[1:15]],
    lmm_results_slice[, 'Gene'][
      order(lmm_results_slice[, 'invert_P'], decreasing = FALSE)[1:15]])

  top_g <- unique(top_g)
  
  lmm_results_slice <- lmm_results_slice[, -1*ncol(lmm_results_slice)] # remove invert_P from matrix
  
  # Flip Contrast
  contrast_lab <- as.character(lmm_results_slice$Contrast)
  contrast_lab <- strsplit(contrast_lab, "-")[[1]]
  contrast_lab <- paste(contrast_lab[2], "-", contrast_lab[1])
  
  # Graph results
  plots_lmm[[counter]] <- ggplot(lmm_results_slice,
               aes(x = Estimate, y = -log10(`Pr(>|t|)`),
                   color = Color, label = Gene)) +
          geom_vline(xintercept = c(fc_threshold, -fc_threshold), lty = "dashed") +
          geom_hline(yintercept = -log10(0.05), lty = "dashed") +
          geom_point() +
          labs(
            x = paste(contrast_lab, " log2(FC)"),
               y = "Significance, -log10(P)",
               color = "Significance") +
          scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                        `FDR < 0.05` = "lightblue",
                                        `P < 0.05` = "orange2",
                                        `NS or FC < 1` = "gray"),
                             guide = guide_legend(override.aes = list(size = 4))) +
          scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
          geom_text_repel(data = subset(lmm_results_slice,  `Pr(>|t|)` < 0.001 & abs(lmm_results_slice$Estimate) > fc_threshold & Gene %in% top_g),
                          point.padding = 0.15, color = "black",size=5,
                          min.segment.length = .1, box.padding = .2, lwd = 2,
                          max.overlaps = 50) +
          theme_bw(base_size = 16) +
          theme(legend.position = "bottom") +
          facet_wrap(~Subset, scales = "free_y")
  counter <- counter + 1
}
grid.arrange(grobs=plots_lmm,ncol=2)

#subset(lmm_results, Gene %in% GOI)
datatable(lmm_results, rownames = FALSE,
          extensions = 'Buttons', options = list (
              dom = 'Bftrip',
              buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
            ),
          caption = "DE results for Genes of Interest (>75% CV)",filter='top') %>% formatRound(columns=c("Estimate","Pr(>|t|)","FDR"), digits=3)

7.4 Plotting Genes of Interest

my_gois <-unique(subset(lmm_results, `FDR` < 0.001)$Gene)
tmp_tbl<-subset(lmm_results, Gene %in% my_gois)

my_gois <- c()
for (contrast in unique(tmp_tbl$Contrast)) { # gene of interest for all contrasts
#for (contrast in c("Jux_Glo - Pro_Tub")) {
  ind <- tmp_tbl$Contrast == contrast
  goi <- tmp_tbl[ind, "Gene"][order(tmp_tbl[ind, "Estimate"], decreasing = FALSE)[1:3]]
  my_gois <- c(my_gois, goi)
  goi <- tmp_tbl[ind, "Gene"][order(tmp_tbl[ind, "Estimate"], decreasing = TRUE)[1:3]]
  my_gois <- c(my_gois, goi)
}

if(nrow(tmp_tbl) > 1) { 
  datatable(tmp_tbl,rownames = FALSE,caption = "DE results for Genes of Interest ",filter='top') %>% formatRound(columns=c("Estimate","Pr(>|t|)","FDR"), digits=3)
 
for (my_goi in my_gois) {
# show expression for a single target
  a<-ggplot(pData(target_Data),
       aes(x = ANN2, fill = ANN2,
           y = as.numeric(assayDataElement(target_Data[my_goi, ], elt = "q_norm")))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = paste(my_goi,"Expression")) +
  scale_y_continuous(trans = "log2") +
  #facet_wrap(~ANN3, nrow=1) + theme_bw(base_size = 16) +
  theme_bw() + coord_flip()
  print(a)
}
}else{
  print("No significant lMM results to plot")
}

7.5 Heatmap of Significant Genes

In addition to generating individual gene box plots or volcano plots, we can again create a heatmap from our data. This time rather than utilizing CV to select genes, we can use the P-value or FDR values to select genes. Here, we plot all genes with an FDR < 0.001.

my_gois <-unique(subset(lmm_results, `FDR` < 0.001)$Gene)   # 100 to prevent long runtime

if(length(my_gois)<2) {
  print("No significant results to show")
 
}else{

pheatmap(log2(assayDataElement(target_Data[my_gois, ], elt = "q_norm")),
         scale = "row",
         show_rownames = TRUE, show_colnames = TRUE,
         border_color = NA,
         clustering_method = "average",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         cutree_cols = 3, cutree_rows = 2,
         breaks = seq(-3, 3, 0.05),
         color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])
}

8 Pathway Analysis

Pathway analysis enables exploration of different aggregate gene sets for our experimental questions. Each individual ROI/AOI segment is scored for every pathway of interest, which we can then use to investigate biological differences. We will perform analogous analyses as those outlined in the Differential Expression and Spatial Deconvolution sections of the report for gene set enrichment.

8.1 Scoring Gene Sets

Pathways and gene sets were defined from the Kegg Brite database. We use an R software package called Gene Set Variation Analysis to score each segment within our study. see https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html for options on collections. We use the KEGG and REACTOME.

#gc()
h_gene_sets = rbind(msigdbr(species = "human", subcategory = "CP:KEGG"),
                    msigdbr(species = "human", subcategory = "CP:REACTOME"))
#msigdbr(species = "human", subcategory = "CP:BIOCARTA")

msigdbr_list = split(x = h_gene_sets$gene_symbol, f = h_gene_sets$gs_name)

# prepare df for accurate merging back genes later
pw_gene_df<-data.frame(Pathway = names(msigdbr_list))
pw_gene_df$genes<-msigdbr_list
ssgsea_results <- GSVA::gsva(expr = assayDataElement(target_Data,
                            elt = "log_q"),
                            gset.idx.list = msigdbr_list,
                            method = "ssgsea",
                            min.sz = 5,
                            max.sz = 500,
                            verbose = FALSE)
## [1] "Calculating ranks..."
## [1] "Calculating absolute values from ranks..."
## [1] "Normalizing..."
geneSetObj <-
  NanoStringGeoMxSet(assayData = ssgsea_results,
                     phenoData = AnnotatedDataFrame(pData(target_Data)),
                     protocolData = protocolData(target_Data),
                     featureType = "GeneSet",
                     check = FALSE)

8.2 Differental analysis of pathways

# # convert test variables to factors
pData(target_Data)$testClass <- factor(pData(target_Data)$ANN1, unique(target_Data$ANN1))
pData(geneSetObj)[["slide"]]<-factor(pData(geneSetObj)[["slide_name"]])
#pData(target_Data)$testRegion<-factor(pData(target_Data)$ANN2, unique(count_mat$ANN3))

lmm_ssgsea_results <- c()
for (region in unique(pData(target_Data)$ANN3)) {
  ind <- pData(target_Data)$ANN3 == region
  mixedOutmc <-
    mixedModelDE(geneSetObj[, ind],
                 elt = "exprs",
                 modelFormula = ~ testClass + (1 | slide),
                 groupVar = "testClass",
                 nCores = parallel::detectCores(),
                 multiCore = TRUE)

  # format results as data.frame
  r_ssgsea_test <- do.call(rbind, mixedOutmc["lsmeans", ])
  ssgsea_tests <- rownames(r_ssgsea_test)
  r_ssgsea_test <- as.data.frame(r_ssgsea_test)
  r_ssgsea_test$Contrast <- ssgsea_tests
  #r_ssgsea_test$Genes <- msigdbr_list seems unreliable as gsva omits pathways if genes are not in data..

  # use lapply in case you have multiple levels of your test factor to
  # correctly associate gene name with it's row in the results table
  r_ssgsea_test$Pathway <-
    unlist(lapply(colnames(mixedOutmc),
                  rep, nrow(mixedOutmc["lsmeans", ][[1]])))

  #r_ssgsea_test$Subset <- status
  r_ssgsea_test$FDR <- p.adjust(r_ssgsea_test$`Pr(>|t|)`, method = "fdr")
  r_ssgsea_test$Subset <- region
  r_ssgsea_test <- r_ssgsea_test[, c("Pathway", "Subset", "Contrast", "Estimate",
                                     "Pr(>|t|)", "FDR")]
  lmm_ssgsea_results <- rbind(lmm_ssgsea_results, r_ssgsea_test)

  }
  lmm_ssgsea_results <- merge(lmm_ssgsea_results, pw_gene_df)

8.2.1 Table of Differental analysis of pathways

datatable(subset(lmm_ssgsea_results), rownames = FALSE,
          extensions = 'Buttons', options = list (
             pageLength = 10,
              dom = 'Bftrip',
              buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
            ),
          caption = "DE results for Pathways",filter='top') %>% formatRound(columns=c("Estimate","Pr(>|t|)","FDR"), digits=5)

8.3 Vulcanoplot of LMM_Pathways

# Categorize Results based on P-value & FDR for plotting
fc_threshold = 0.3

lmm_ssgsea_results$Color <- "NS or FC < 0.3"
lmm_ssgsea_results$Color[lmm_ssgsea_results$`Pr(>|t|)` < 0.05] <- "P < 0.05"
lmm_ssgsea_results$Color[lmm_ssgsea_results$FDR < 0.05] <- "FDR < 0.05"
lmm_ssgsea_results$Color[lmm_ssgsea_results$FDR < 0.001] <- "FDR < 0.001"
lmm_ssgsea_results$Color[abs(lmm_ssgsea_results$Estimate) < fc_threshold] <- "NS or FC < 0.3"
lmm_ssgsea_results$Color <- factor(lmm_ssgsea_results$Color,
                        levels = c("NS or FC < 0.3", "P < 0.05",
                                   "FDR < 0.05", "FDR < 0.001"))

# pick top pw for either side of volcano to label
# order pw for convenience:
lmm_ssgsea_results$invert_P <- (-log10(lmm_ssgsea_results$`Pr(>|t|)`)) * sign(lmm_ssgsea_results$Estimate)
top_ssgsea_g <- c()

#loop here over tested conditions if applicable

for(c in unique(lmm_ssgsea_results$Subset) ) {
  lmm_results_slice = lmm_ssgsea_results[lmm_ssgsea_results$Subset == c,]
 top_ssgsea_g <- c(top_ssgsea_g,
         lmm_ssgsea_results[ind, 'Pathway'][
               order(lmm_ssgsea_results[ind, 'invert_P'], decreasing = TRUE)[1:15]],
         lmm_ssgsea_results[ind, 'Pathway'][
               order(lmm_ssgsea_results[ind, 'invert_P'], decreasing = FALSE)[1:15]])
}
 
top_ssgsea_g <- unique(top_ssgsea_g)
lmm_ssgsea_results <- lmm_ssgsea_results[, -1*ncol(lmm_ssgsea_results)] # remove invert_P from matrix

#lmm_ssgsea_results_slice <- lmm_ssgsea_results_slice[lmm_ssgsea_results_slice$FDR < 1,]

# Graph results
ggplot(lmm_ssgsea_results,
       aes(x = Estimate, y = -log10(`Pr(>|t|)`),
           color = Color, label = Pathway)) +
    geom_vline(xintercept = c(0.2, -0.2), lty = "dashed") +
    geom_hline(yintercept = -log10(0.05), lty = "dashed") +
    geom_point() +
    labs(x = paste(lmm_results_slice$Contrast, "log2(FC)"),
         y = "Significance, -log10(P)",
         color = "Significance") +
    scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                  `FDR < 0.05` = "lightblue",
                                  `P < 0.05` = "orange2",
                                  `NS or FC < 0.5` = "gray"),
                       guide = guide_legend(override.aes = list(size = 4))) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
    geom_text_repel(data = subset(lmm_ssgsea_results, Color == "FDR < 0.05" | Color == "FDR < 0.001"),
                   point.padding = 0.15, color = "black",size=3,
                   min.segment.length = .1, box.padding = .2, lwd = 2,
                   max.overlaps = 50) +
    theme_bw(base_size = 16) +
    theme(legend.position = "bottom") +
    facet_wrap(~Subset, scales = "free_y")

#    +facet_wrap(~Subset, scales = "free_y"))

8.4 heatmap of pathways

  active_pw<-filter(lmm_ssgsea_results, `Pr(>|t|)` < 0.001)$Pathway
  active_pw<-filter(lmm_ssgsea_results, FDR < 0.001 )$Pathway
  #active_pw<-filter(lmm_ssgsea_results, Color == "FDR < 0.001")$Pathway
  
  
  active_pw<-top_ssgsea_g
  
  if (length(active_pw)>1) {
  
    print("go")
    
  pw_matrix<-assayDataElement(geneSetObj, elt = "exprs")

  pheatmap(pw_matrix[active_pw,],
         scale = "row",
         show_rownames = TRUE, show_colnames = TRUE,
         fontsize_row = 10,
         border_color = NA,
         clustering_method = "average",
         #clustering_distance_rows = "correlation",
         clustering_distance_cols = "euclidean",
         cutree_cols = 2, cutree_rows = 2,
         breaks = seq(-3, 3, 0.05),
         #color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
         main = "Heatmap of selected Pathways",
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])
  
  }else{
    print("No significant results to display")
  }
## [1] "go"

8.5 fgsea pathway analysis

Another option for pathway analysis is the R software package called Fast Gene Set Enrichment Analysis. It is a pre-ranked method that uses the LMM results combined with the same pathway list from msigdbr as GSVA.

fgsea_results_all <- c()
for (contrast in unique(lmm_results$Contrast)) {
    
    ranks <- lmm_results$Estimate[lmm_results$Contrast == contrast]
    names(ranks) <- lmm_results$Gene[lmm_results$Contrast == contrast]
    
    fgsea_results <- fgsea(msigdbr_list, 
                         ranks, 
                         eps = 0.0,
                         minSize=15, 
                         maxSize = 500)
    fgsea_results$Contrast <- contrast
    fgsea_results_all <- rbind(fgsea_results_all, fgsea_results)
    
}

fgsea_reactome <- fgsea_results_all[grep("REACTOME", pathway),]
fgsea_reactome$pathway <- gsub("REACTOME_", "", fgsea_reactome$pathway)
fgsea_reactome$pathway <- gsub("_", " ", fgsea_reactome$pathway)
# Categorize Results based on P-value & FDR for plotting
fc_threshold = 0.3


fgsea_results_all$Color <- "NS or FC < 0.3"
fgsea_results_all$Color[fgsea_results_all$pval < 0.05] <- "P < 0.05"
fgsea_results_all$Color[fgsea_results_all$padj < 0.05] <- "FDR < 0.05"
fgsea_results_all$Color[fgsea_results_all$padj < 0.001] <- "FDR < 0.001"
fgsea_results_all$Color[abs(fgsea_results_all$ES) < fc_threshold] <- "NS or FC < 0.3"
fgsea_results_all$Color <- factor(fgsea_results_all$Color,
                        levels = c("NS or FC < 0.3", "P < 0.05",
                                   "FDR < 0.05", "FDR < 0.001"))

plt <- htmltools::tagList()
counter = 1
for(c in unique(fgsea_results_all$Contrast) ) {
  fgsea_results_slice = fgsea_results_all[fgsea_results_all$Contrast == c,]

  # # pick top genes for either side of volcano to label
  # # order genes for convenience:
  fgsea_results_slice$invert_P <- (-log10(fgsea_results_slice$pval)) * sign(fgsea_results_slice$ES)
  
  # #loop here over tested conditions if applicable
  #top_fgsea_g <- c()
  top_fgsea_g <- c(fgsea_results_slice[, 'pathway'][
                        order(fgsea_results_slice[, 'invert_P'], decreasing = TRUE)[1:15]],
                    fgsea_results_slice[, 'pathway'][
                        order(fgsea_results_slice[, 'invert_P'], decreasing = FALSE)[1:15]])
   
  # for (celltype in unique(lmm_results$Subset)) {
  #      top_fgsea_g <- c(top_fgsea_g,
  #                  fgsea_results_slice[fgsea_results_slice$Subset==celltype, 'pathway'][
  #                      order(fgsea_results_slice[, 'invert_P'], decreasing = TRUE)[1:20]],
  #                  fgsea_results_slice[fgsea_results_slice$Subset==celltype, 'pathway'][
  #                      order(fgsea_results_slice[, 'invert_P'], decreasing = FALSE)[1:20]])
  # }
  
  top_fgsea_g <- unique(top_fgsea_g)
  #fgsea_results_slice <- fgsea_results_slice[, -1*ncol(fgsea_results_slice)] # remove invert_P from matrix
  
  # Graph results
  dynplot <- ggplot(fgsea_results_slice,
         aes(x = ES, y = -log10(pval),
             color = Color, label = pathway)) +
      geom_vline(xintercept = c(fc_threshold,-fc_threshold), lty = "dashed") +
      geom_hline(yintercept = -log10(0.05), lty = "dashed") +
      geom_point() +
      labs(x = paste(c, " FC"),
           y = "Significance, -log10(P)",
           color = "Significance") +
      scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                    `FDR < 0.05` = "lightblue",
                                    `P < 0.05` = "orange2",
                                    `NS or FC < 0.3` = "gray"),
                         guide = guide_legend(override.aes = list(size = 4))) +
      scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
      geom_text_repel(data = subset(fgsea_results_slice, Color == "P < 0.05" | Color == "FDR < 0.05" | Color == "FDR < 0.001"),
                     point.padding = 0.15, color = "black",size=5,
                     min.segment.length = .1, box.padding = .2, lwd = 2,
                     max.overlaps = 50) +
      theme_bw(base_size = 16) +
      theme(legend.position = "bottom") #+
      #facet_wrap(~Subset, scales = "free_y")
   
  plt[[counter]] <- as_widget(ggplotly(dynplot))
  counter <- counter + 1
  #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
}
plt

Barplot pathways

Clear vizualization of the top 15 positive and negative expressed Reactome pathways. Pathways with a adjusted p-value of 0.05 or higher will be presented as red and below 0.05 as green.

Fgsea returns results from the given databases. To avoid one database overshadowing the other, results have been split to show database specific results. Reactome is shown in barplots and KEGG as a pathway image including highlighted genes.

top_all <- c()
for (contrast in unique(fgsea_reactome$Contrast)) {
    # active_group1 <- contrast_list[contrast][[1]][[1]]
    # active_group2 <- contrast_list[contrast][[1]][[2]]
    # ind <- pData(target_Data)[pData(target_Data)$ANN2 == active_group1 | pData(target_Data)$ANN2 == active_group2]
    top <- fgsea_reactome[fgsea_reactome$Contrast == contrast]
    toppos <- top[order(top$NES, decreasing = TRUE)][0:15]
    topneg <- top[order(top$NES, decreasing = FALSE)][0:15]
    top <- rbind(toppos, topneg)
    
    top$Color[top$padj < 0.05] <- "padj < 0.05"
    top$Color[top$padj == 0.05 | top$padj > 0.05] <- "padj > 0.05"
    top_all <- rbind(top_all, top)
    barplot <- ggplot(top,aes(x=reorder(pathway, NES),y=NES,fill=Color)) + 
      scale_x_discrete(labels = function(x) str_wrap(x, width = 100)) + 
      geom_col(position="dodge") + scale_fill_manual(values = c("#2ECC71", "#E74C3C")) +
      ggtitle(paste("Top 15 + and - NES reactome pathways from", contrast)) + 
      xlab("pathway") + ylab("Normalized Enrichment Score") +
      coord_flip() + 
      theme(text = element_text(size = 20)) +
      theme(panel.background = element_blank())
      #facet_wrap(~Contrast, scales = "free_y")
    print(barplot)
}

8.6 Visualize KEGG pathway

Get pathway visualization from KEGG. clusterProfiler grabs the KEGG pathway IDs and combined with pathview it can download a pathway image from the KEGG database. The LMM results are given with the pathway id to visualize up and down regulated genes. It will pick the most significant pathway if not manually specified.

get_wp_gmtfile <- function() {
    wpurl <- 'https://wikipathways-data.wmcloud.org/current/gmt/'
    x <- readLines(wpurl)
    y <- x[grep('\\.gmt',x)]
    sub(".*(wikipathways-.*\\.gmt).*", "\\1",  y[grep('File', y)])
}

get_wp_data <- function(organism) {
    organism <- sub(" ", "_", organism)
    gmtfile <- get_wp_gmtfile()
    wpurl <- 'https://wikipathways-data.wmcloud.org/current/gmt/'
    url <- paste0(wpurl,
                  gmtfile[grep(organism, gmtfile)])
    f <- tempfile(fileext = ".gmt")
    dl <- mydownload(url, destfile = f)
    if (is.null(f)) {
        message("fail to download wikiPathways data...")
        return(NULL)
    }
    read.gmt.wp(f)
}
# enrichKEGG grab online pathways
gene <-  target_Data@featureData@data[["GeneID"]]
kk <- enrichKEGG(gene         = gene,
                 organism     = 'hsa',
                 pvalueCutoff = 0.1)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
enrichKEGG_results <- kk@result

# Match column values for link
# fsgea run
fgsea_results$Description <- gsub("KEGG_", "", fgsea_results$pathway)
fgsea_results$Description <- gsub("_", " ", fgsea_results$Description)
# ssgea run
# lmm_ssgsea_results_d$Description <- gsub("KEGG_", "", lmm_ssgsea_results_d$Pathway)
# lmm_ssgsea_results_d$Description <- gsub("_", " ", lmm_ssgsea_results_d$Description)

enrichKEGG_results$Description <- toupper(enrichKEGG_results$Description)

# Create df for matching KEGG results
KEGGIDs <- fgsea_results#[fgsea_results$Contrast == "Pro_Tub - Dis_Tub"]
KEGGIDs <- KEGGIDs %>% inner_join(enrichKEGG_results, by = 'Description') %>% select(Description, ID, geneID, leadingEdge, padj)

# Create geneList with DE results for visualization pathway genes 
genelist <- lmm_results$Estimate
names(genelist) <- target_Data@featureData@data[["GeneID"]]

# Choose manual or highest significant pathway
pathway_name <- "INSULIN SIGNALING PATHWAY" # manual pathway query
pathway_name <- toupper(pathway_name)
if (pathway_name == "") {
  pathway_name <- KEGGIDs[order(KEGGIDs$padj, decreasing = FALSE), ][1]$Description
}
pathwayid <- KEGGIDs$ID[KEGGIDs$Description == pathway_name]

# Grab pathway image with gene info, save and plot
viewPath <- pathview(gene.data  = genelist,
                     pathway.id = pathwayid,
                     species    = "hsa",
                     out.suffix = "genesinfo", kegg.native = T, same.layer = F)
## 'select()' returned 1:1 mapping between keys and columns
## Info: Working in directory C:/Users/pkloosterman/Documents/GitHub/GeoMX-analysis/Data/DKD_Kidney
## Info: Writing image file hsa04910.genesinfo.png
img <- readPNG(paste(pathwayid, ".genesinfo.png", sep = ""))
grid::grid.raster(img)

9 Spatial Deconvolution

9.1 Calculate backgrounds

#gc()
bg = derive_GeoMx_background(
  norm = assayDataElement(target_Data , elt = "q_norm"),
  probepool = fData(target_Data)$Module,
  negnames = c("NegProbe-CTP01","NegProbe-Kilo","Negative Probe", "NegProbe-WTX" ))
  #negnames = "NegProbe-WTX")

9.2 Load cell profile

A “cell profile matrix” is a pre-defined matrix that specifies the expected expression profiles of each cell type in the experiment. The SpatialDecon library comes with one such matrix pre-loaded, the “SafeTME” matrix, designed for estimation of immune and stroma cells in the tumor microenvironment. (This matrix was designed to avoid genes commonly expressed by cancer cells; see the SpatialDecon manuscript for details.). Otherwise, load specific profiles from https://github.com/Nanostring-Biostats/CellProfileLibrary/tree/NewProfileMatrices

#safeTME
data("safeTME")
data("safeTME.matches")
current_cell_profile<-safeTME

#see: https://github.com/Nanostring-Biostats/CellProfileLibrary/tree/NewProfileMatrices

current_cell_profile <- download_profile_matrix(species = "Human",
                                               age_group = "Adult",
                                               matrixname = "Kidney_HCA")

heatmap(sweep(current_cell_profile, 1, apply(current_cell_profile, 1, max), "/"),
        labRow = NA, margins = c(10, 5), cexCol = 0.7)

9.3 Run spatial deconvolution

# vector identifying pure tumor segments:
#target_Data$istumor = target_Data$ANN3 == "CORE" & target_Data$ANN1 == "PanCK+"
res = runspatialdecon(object = target_Data,
                      norm_elt = "q_norm",
                      raw_elt = "exprs",
                      #is_pure_tumor = target_Data$istumor,
                      cell_counts = target_Data$nuclei,
                      X = current_cell_profile,
                      #cellmerges = safeTME.matches,              # safeTME.matches object, used by default
                      #n_tumor_clusters = 5,                      # how many distinct tumor profiles to append to safeTME
                      align_genes = TRUE)

9.3.1 Spatial deconvolution heatmaps

Abundance

# NOTE: check clustering.. why different?

#set display thresholds
thresh <- signif(quantile(res$beta, 0.97), 2)

# plot stored to keep clustering for later
p1<-pheatmap(pmin(t(res$beta),thresh),
         #scale = "row", 
         cutree_cols = 3,
         cutree_rows = 2,
         fontsize_row = 12,
         show_rownames = TRUE, show_colnames = TRUE,
         angle_col = "90",
         border_color = NA,
         #clustering_method = "average",
         #clustering_distance_rows = "correlation",
         #clustering_distance_cols = "correlation",
         legend_breaks = c(round(seq(0, thresh, length.out = 5))[-5], thresh),
         legend_labels = c(round(seq(0, thresh, length.out = 5))[-5], paste0("Abundance scores,\ntruncated above at ", thresh)),
         #breaks = seq(0, 5, 1),
         color = colorRampPalette(c("white","darkblue"))(100),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names]
         )

#p1

Proportional

# proportions:
props <- replace(res$prop_of_nontumor, is.na(res$prop_of_nontumor), 0)

p2<-pheatmap(t(props),
         #scale = "row", 
         cutree_cols = 3,
         cutree_rows = 2,
         fontsize_row = 12,
         show_rownames = TRUE, show_colnames = TRUE,
         angle_col = "90",
         border_color = NA,
         #clustering_method = "average",
         #clustering_distance_rows = "correlation",
         #clustering_distance_cols = "correlation",
         legend_breaks = round(seq(0, max(props) * 0.99, length.out = 5), 2),
         legend_labels = c(round(seq(0, max(props), length.out = 5), 2)[-5], "Proportion of all\nfitted populations"),
         color = colorRampPalette(c("white","darkblue"))(100),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])

#p2

Scaled

# scaled abundances:
epsilon <- min(res$beta[res$beta > 0])
mat <- sweep(res$beta, 1, pmax(apply(res$beta, 1, max), epsilon), "/")

pheatmap(t(mat),
         #scale = "row",
         cutree_cols = 3,
         cutree_rows = 3,
         fontsize_row = 12,
         show_rownames = TRUE, show_colnames = TRUE,
         angle_col = "90",
         border_color = NA,
         #clustering_method = "average",
         #clustering_distance_rows = "correlation",
         #clustering_distance_cols = "correlation",
         legend_breaks = c(round(seq(0, 1, length.out = 5), 2)[-5], 1),
         legend_labels = c(round(seq(0, 1, length.out = 5), 2)[-5], "Scaled abundance\n(ratio to max)"),
         color = colorRampPalette(c("white","darkblue"))(100),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])

9.4 Barplots

abundance

# define variables to show in heatmaps:
pData(target_Data)$region <- 
    factor(pData(target_Data)$ANN3, unique(target_Data$ANN3))   # Must be manual?
pData(target_Data)$class <- 
    factor(pData(target_Data)$ANN1, unique(target_Data$ANN1))   # Must be manual?

variables_to_plot <- c("slide_name", "ANN1", "ANN3")                     # Must be manual?

#define colors

# only for safeTME colors
#col <- cellcols

#custom celmatrix
#get large number of colors
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))))
celltypes<-sample(col_vector,length(colnames(res$beta)))
names(celltypes)<-colnames(res$beta)
col<-celltypes


#tempfix for abbreviated and now mismatching annotations
#can just use ann dataframe?
#tmpann<-cbind(ANN1,ANN2,SN)
tmpann <- pData(target_Data)[ann_names]



layout(matrix(c(1, 2, 3, 3), nrow = 2),
       widths = c(10, 3, 10, 3),
       heights = c(1, 8, 10),
      )

par(mar = c(0, 8.2, 0, 0.2))
plot(p1$tree_col, labels = F, main = "", ylab = "", yaxt = "n")
par(mar = c(15, 8, 0, 0))

# data to plot:
mat <- t(res$beta)[, p1$tree_col$order]
# infer scale of negative y-axis for annotation colorbars
ymin <- -max(colSums(mat)) * 0.15
if (!is.finite(ymin)) {
  ymin <- 0
}

# draw barplot:
bp <- barplot(mat,
              cex.lab = 1.5,
              col = col, border = NA,
              cex.names = 1.1,
              las = 2, main = "", ylab = "Abundance scores",
              ylim = c(ymin, max(colSums(mat)))
)


# add color bars for annotations
for (name in rev(variables_to_plot)) {
  yrange <- seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 1)[match(name, variables_to_plot) + c(0, 1)]
  xwidth <- (bp[2] - bp[1]) / 2
  for (i in 1:ncol(mat)) {
    rect(bp[i] - xwidth, yrange[2], bp[i] + xwidth, yrange[1],
         # border = NA, col = ann_colors[[name]][segmentAnnotations[match(colnames(mat)[i], segmentAnnotations$segmentID), name]]
         #border = NA, col = ann_colors[[name]][ann[p1$tree_col$order[i], name]]
         border = NA, col = color_list[[name]][tmpann[colnames(mat)[i], name]]
    )
  }
}
axis(2,
     at = seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 2)[-c(1, length(variables_to_plot) + 2)],
     las = 2, labels = variables_to_plot, lty = 0, cex.axis = 1.2
)

#draw a legend:
par(mar = c(0.1, 0.1, 0.1, 0.1))
frame()
legendcols <- legendnames <- c()
#for (name in rev(names(ann_colors))) {
for (name in c("slide_name", "ANN1", "ANN3")) {
  legendcols <- c(legendcols, NA, color_list[[name]], NA)
  legendnames <- c(legendnames, name, names(color_list[[name]]), NA)
}
legend("center",
       pch = 15,
       cex = 1.5,
       col = c(legendcols, rep(NA, 1), rev(col)),
       legend = c(legendnames, "Cell type", rev(names(col))),
)

proportional

# define variables to show in heatmaps:
variables_to_plot <- c("slide_name", "ANN1", "ANN3")

layout(matrix(c(1, 2, 3, 3), nrow = 2),
       widths = c(10, 3, 10, 3),
       heights = c(1, 8, 10),
      )

par(mar = c(0, 8.2, 0, 0.2))
plot(p2$tree_col, labels = F, main = "", ylab = "", yaxt = "n")
par(mar = c(15, 8, 0, 0))

# data to plot:
mat <- t(res$prop_of_nontumor)[, p2$tree_col$order]
  mat <- replace(mat, is.na(mat), 0)
  # infer scale of negative y-axis for annotation colorbars
  ymin <- -0.15

# draw barplot:
bp <- barplot(mat,
              cex.lab = 1.5,
              col = col, border = NA,
              cex.names = 1.1,
              las = 2, main = "", ylab = "Proportion of fitted cells",
              ylim = c(ymin, max(colSums(mat)))
)

# add color bars for annotations
for (name in rev(variables_to_plot)) {
  yrange <- seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 1)[match(name, variables_to_plot) + c(0, 1)]
  xwidth <- (bp[2] - bp[1]) / 2
  for (i in 1:ncol(mat)) {
    rect(bp[i] - xwidth, yrange[2], bp[i] + xwidth, yrange[1],
         # border = NA, col = ann_colors[[name]][segmentAnnotations[match(colnames(mat)[i], segmentAnnotations$segmentID), name]]
         #border = NA, col = ann_colors[[name]][ann[p2$tree_col$order[i], name]]
         border = NA, col = color_list[[name]][tmpann[colnames(mat)[i], name]]
    )
  }
}
axis(2,
     at = seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 2)[-c(1, length(variables_to_plot) + 2)],
     las = 2, labels = variables_to_plot, lty = 0, cex.axis = 1.2
)


#draw a legend:
par(mar = c(0.1, 0.1, 0.1, 0.1))
frame()
legendcols <- legendnames <- c()
#for (name in rev(names(ann_colors))) {
for (name in c("slide_name", "ANN1", "ANN3")) {
  legendcols <- c(legendcols, NA, color_list[[name]], NA)
  legendnames <- c(legendnames, name, names(color_list[[name]]), NA)
}
legend("center",
       pch = 15,
       cex = 1.4,
       col = c(legendcols, rep(NA, 1), rev(col)),
       legend = c(legendnames, "Cell type", rev(names(col))),
)

Boxplots

Boxplot per cell from the spatial deconvolution. Choose your annotation in which you want to compare (ANN). The annotation will be divided in conditions and each cell with have a boxplot which include the abundance score of each condition. A statistical test has been added inside the boxplot to check for significance. Feel free to chance the test to fit the data.

significance code p-value *** [0, 0.001] ** (0.001, 0.01] * (0.01, 0.05] . (0.05, 0.1] (0.1, 1]

ANN <- "ANN1" # ANN column with conditions
if (length(unique(target_Data[[ANN]])) > 2) {
  test <- "kruskal.test"
} else {
  test <- "wilcox.test"
}

res_filter <- res[!res@phenoData@data[["beta"]]==0]
res_filter <- as.data.frame(res_filter$beta)
res_filter[res_filter == 0] <- NA
boxplot_prep <- res_filter[ , colSums(is.na(res_filter)) < nrow(res_filter)] 

boxplot_prep$condition <- rownames(boxplot_prep)

for (value in unique(target_Data[[ANN]])) {
  con <- rownames(pData(target_Data)[pData(target_Data)[[ANN]] == value,])
  boxplot_prep$condition[boxplot_prep$condition %in% con] <- value
}

boxplot <- melt(boxplot_prep)
## Using condition as id variables
# function for number of observations 
give.n <- function(x){
  return(data.frame(y = -1, label = paste0("n = ",length(x))))
 #return(c(y = median(x)*1.05, label = length(x))
# experiment with the multiplier to find the perfect position
}

ggplot(boxplot, aes(factor(condition), value, fill = condition)) + 
  geom_boxplot() + 
  geom_jitter(position=position_jitter(w=0.1,h=0.1)) +
  facet_wrap(~variable, scale="free", ncol = 5) +
  theme(text = element_text(size = 20)) +
  rotate_x_text(angle = 45) +
  labs(
    title = "Spatial Deconvolution",
    x = "Conditions",
    y = "Abuncance score") + 
  stat_compare_means(method = test, label.y = -3, size=5) + 
  stat_compare_means(label = "p.signif", method = "t.test",
                     ref.group = ".all.", hide.ns = TRUE, size=5) +
  stat_summary(fun.data = give.n, geom = "text", size = 5)

#stat_compare_means(size=5)

11 CNV

Copy Number Variation analysis with the R software package inferCNV (https://github.com/broadinstitute/inferCNV/wiki).

InferCNV is used to explore tumor single cell RNA-Seq data to identify evidence for somatic large-scale chromosomal copy number alterations, such as gains or deletions of entire chromosomes or large segments of chromosomes. This is done by exploring expression intensity of genes across positions of tumor genome in comparison to a set of reference ‘normal’ cells. A heatmap is generated illustrating the relative expression intensities across each chromosome, and it often becomes readily apparent as to which regions of the tumor genome are over-abundant or less-abundant as compared to that of normal cells.

Per patient a cnv analysis with a provided reference group. The gene order is made from the projects .pkc file.

Select the annotation where the CNV analysis should look at specifically as group and specify a subgroup as group_filter if the group of interest is a subgroup inside the annotation. For example if the CNV analysis needs to be only on tumor regions the group would be ANNX(ANN column with the info about the tumor regions) and group_filter PanCK+. The annotation that references the patients as patients. If there are no patients leave the parameter empty as ““. The annotation that should be included in the results including a reference set as cnv_target. This will create patient specific files with all the information inferCNV needs.

Select the reference set in reference. This can be more than one. Every patient without its own reference will not be analysed.

#########PARAMETERS########
reference <- c("normal")
###########################

failed <- c()
cnv_list <- list()

for (patient in patient_list) {
  # Name output folder
  out_dir <- paste0(patient, "_CNV")
  if (str_detect(paste(readLines(paste0(patient, ".Annotations.tsv")), collapse = ''), reference) == FALSE) {
    failed <- c(failed, patient)
    next
    }

  # Create the infercnv object
  infercnv_obj = CreateInfercnvObject(raw_counts_matrix= paste0(patient, ".Counts.tsv"),
                                      annotations_file= paste0(patient, ".Annotations.tsv"),
                                      delim="\t",
                                      gene_order_file= "gene_order_Hs_WTA_v1_pkc.txt",
                                      #gene_order_file= "gencode_v19_gene_pos.txt",
                                      ref_group_names= reference, # input the normal/reference group names
                                      chr_exclude = c("chrM"))#c("chrM")) # Default excludes chrX, chrY and chrM. By only picking chrM you include the X and Y chromosomes.

  # perform infercnv operations to reveal cnv signal. For all options: https://rdrr.io/github/broadinstitute/infercnv/man/run.html
  #cnv_list[[patient]]
  infercnv_obj <- infercnv::run(infercnv_obj,
                               cutoff=0.1,  # use 1 for smart-seq, 0.1 for 10x-genomics
                               out_dir= out_dir,  # dir is auto-created for storing outputs
                               cluster_by_groups=FALSE,   # cluster
                               denoise=TRUE,
                               HMM=FALSE,
                               tumor_subcluster_partition_method = c("qnorm"),
                               analysis_mode = "subclusters",
                               no_plot=TRUE
                               #,debug=TRUE
                               )

  plot_cnv(infercnv_obj,
          out_dir = paste0(patient, "_CNV"),
          title = "inferCNV",
          obs_title = "Observations (Cells)",
          ref_title = "References (Cells)",
          cluster_by_groups = FALSE,
          cluster_references = FALSE,
          plot_chr_scale = FALSE,
          #chr_lengths = NULL,
          k_obs_groups = 1,
          contig_cex = 1.5,
          #x.center = mean(infercnv_obj@expr.data),
          x.range = "auto",
          #hclust_method = "ward.D",
          output_filename = "infercnv",
          output_format = "png",
          png_res = 300
          )
}
for (patient in patient_list) {
  print(paste(group_filter, patient))
  if (patient %in% failed) {
    print(" ^    Patient contained no reference group")
    next
  }
  img <- readPNG(paste(paste0(patient, "_CNV"), "/infercnv.png", sep = ""))
  grid::grid.newpage()
  grid::grid.raster(img)
}
## [1] " dummy"

11.1 Dendrogram

Closer look at the dendrogram from the CNV analysis per patient where the nodes as written numbers. Use the numbers to select subgroups for further analysis in ‘t-test on two subgroups’.

#out_dir <- "T1_NANO_012_CNV"
trees <- list()

for (patient in patient_list) {
  if (patient %in% failed) {
    #print("Patient contained no reference group")
    next
  }
tree <- read.tree(paste(patient,"_CNV/infercnv.observations_dendrogram.txt", sep = ""))
obv <- read.csv(paste(patient,"_CNV/infercnv.observation_groupings.txt", sep = ""), sep="")
trees[[patient]] <- ggtree(
  tree, ladderize=F) +
  geom_treescale() +
  geom_tiplab(color=obv$Annotation.Color, hjust=-.2) +
  coord_cartesian(clip = 'off') +
  theme_tree2(plot.margin=margin(6, 200, 6, 6)) +
  geom_text2(aes(label=node), hjust=-.3, size = 3) +
  ggplot2::labs(title = patient)
}

grid.arrange(grobs=trees,ncol=3)

t-test chr with groups

Compare the contrast within a chromosome. Select a chromosome, contrast, and patient of interest to run a t-test on.

#########PARAMETERS########
chr <- "chrX" # Select chromosome of interest
contrast <- c("normal", "DKD") # Select contrast
patient <- "dummy"
###########################

positions <- as.data.frame(positions)
colnames(positions) <- c("gene", "chr", "begin", "end")
select_genes <- positions$gene[positions$chr == chr] # Grab chr specific genes

annotation <- as.data.frame(read.delim(paste0(patient, ".Annotations.tsv"), header=FALSE))
colnames(annotation) <- c("Sample_ID", "ANN")
select_samples <- annotation

filter_counts <- as.data.frame(target_Data@assayData[["log_q"]])
colnames(filter_counts) <- gsub('.dcc','', colnames(filter_counts))
filter_counts <- filter_counts[,select_samples$Sample_ID] # filter out samples that are not the interesting region

select_genes <- select_genes[select_genes %in% rownames(filter_counts)] # Only use genes that are actually in the data (pkc gene file has all of them)
filter_counts <- filter_counts[select_genes,] # filter out non chromosome specific genes
plots<-list()
tables<-list()
labels<-list()
test<-"ttest"
mtc<-"BH"
counter=1

log_q_filter <-as.data.frame(filter_counts)

comps_df<-data.frame(comp='',val='')

for (active_group1 in contrast) {
    for (active_group2 in contrast) {

      #supress reduncant compares
      if(active_group1==active_group2) {next}
      comp<-paste(sort(c(active_group1,active_group2)),collapse = "_")
      #print(comp)
      if(comp %in% comps_df$comp) {next}
      temp_df<-data.frame(comp=comp ,val=1)
      comps_df<-rbind(comps_df,temp_df)

      labels[[counter]]<-paste(active_group1," vs ", active_group2)
      group1<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group1]]
      group2<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group2]]

      #run t_tests
      results<-as.data.frame ( apply(log_q_filter, 1, function(x) t.test(x[colnames(group1)],x[colnames(group2)])$p.value) )
      colnames(results)<-"raw_p_value"

      #multiple_testing_correction
      adj_p_value<- p.adjust(results$raw_p_value,method=mtc)
      results<-cbind(results,adj_p_value)

      #calc_fdr
      FDR<- p.adjust(results$raw_p_value,method="fdr")
      results<-cbind(results,FDR)

      #fold_changes
      #as base data is already log transformed, means need to be subtracted to get FC in log space
      fchanges<-as.data.frame( apply(log_q_filter, 1, function(x) (mean(x[colnames(group1)]) - mean(x[colnames(group2)]) ) ) )
      colnames(fchanges)<-"FC"
      results<-cbind(results,fchanges)

      #add genenames
      results$Gene<-rownames(results)

      #set categories based on P-value & FDR for plotting
      results$Color <- "NS or FC < 0.5"
      results$Color[results$adj_p_value < 0.05] <- "P < 0.05"
      results$Color[results$FDR < 0.05] <- "FDR < 0.05"
      results$Color[results$FDR < 0.001] <- "FDR < 0.001"
      results$Color[abs(results$FC) < 0.5] <- "NS or FC < 0.5"
      results$Color <- factor(results$Color,
                              levels = c("NS or FC < 0.5", "P < 0.05", "FDR < 0.05", "FDR < 0.001"))

      #vulcanoplot

      # pick top genes for either side of volcano to label
      # order genes for convenience:

      results$invert_P <- (-log10(results$adj_p_value)) * sign(results$FC)
      top_g <- c()
      top_g <- c(top_g,
                 results[ind, 'Gene'][
                   order(results[ind, 'invert_P'], decreasing = TRUE)[1:15]],
                 results[ind, 'Gene'][order(results[ind, 'invert_P'], decreasing = FALSE)[1:15]])
      top_g<- unique(top_g)
      results <- results[, -1*ncol(results)] # remove invert_P from matrix

      # Graph results
      plots[[counter]]<- ggplot(results,
                                      aes(x = FC, y = -log10(adj_p_value),
                                          color = Color, label = Gene)) +
        geom_vline(xintercept = c(0.5, -0.5), lty = "dashed") +
        geom_hline(yintercept = -log10(0.05), lty = "dashed") +
        geom_point() +
        labs(x = paste("Enriched genes in", chr, "-", active_group2," <- log2(FC) -> Enriched in", active_group1),
             y = "Significance, -log10(P)",
             color = "Significance") +
        scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                      `FDR < 0.05` = "lightblue",
                                      `P < 0.05` = "orange2",
                                      `NS or FC < 0.5` = "gray"),
                           guide = guide_legend(override.aes = list(size = 4))) +
        scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
        geom_text_repel(data = subset(results, FDR<0.05 & (-0.5>FC| FC>0.5)),
                        point.padding = 0.15, color = "black", size=3.5,
                        min.segment.length = .1, box.padding = .2, lwd = 2,
                        max.overlaps = 50) +
        theme_bw(base_size = 20) +
        theme(legend.position = "bottom") +
        #ggtitle(paste(slide,": ", test, mtc,"multitest corr"))
        ggtitle(paste(patient, ": ", test, mtc,"multitest corr"))

      #store tables for display later
      tables[[counter]]<-results

      counter = counter+1
      #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
    }
  }

grid.arrange(grobs=plots,ncol=2)

t-test on two subgroups

Select nodes from the dendrogram as subgroups, in this case nodes 18 and 15. Decide on the involved chromosomes and add them into the chr_list. Specify the patient in patient.

#########PARAMETERS########
chr_list <- c("chr22") # Choose needed chromosomes of the target area. Think of it as the x-axis
patient <- "dummy"
node_interest <- 233 # Choose subgroup 1
node_interest2 <- 216 # Choose subgroup 2
###########################

tree <- read.tree(paste(patient,"_CNV/infercnv.observations_dendrogram.txt", sep = ""))
tree_info <- tree %>% as.treedata %>% as_tibble # transform tree into accessible data

samples_interest <- offspring(tree_info, node_interest) # Get labels attached to the group
samples_interest <- as.character(na.omit(samples_interest$label)) # formatting
paste("Subgroup 1: ", samples_interest)
##  [1] "Subgroup 1:  DSP-1001250007864-D-H08"
##  [2] "Subgroup 1:  DSP-1001250007864-D-H10"
##  [3] "Subgroup 1:  DSP-1001250007864-D-H06"
##  [4] "Subgroup 1:  DSP-1001250007864-D-H02"
##  [5] "Subgroup 1:  DSP-1001250007864-D-H04"
##  [6] "Subgroup 1:  DSP-1001250007864-D-H11"
##  [7] "Subgroup 1:  DSP-1001250007864-D-H12"
##  [8] "Subgroup 1:  DSP-1001250007864-D-H05"
##  [9] "Subgroup 1:  DSP-1001250007864-D-H07"
## [10] "Subgroup 1:  DSP-1001250007864-D-H09"
## [11] "Subgroup 1:  DSP-1001250007864-D-H01"
## [12] "Subgroup 1:  DSP-1001250007864-D-H03"
## [13] "Subgroup 1:  DSP-1001250007851-H-A09"
## [14] "Subgroup 1:  DSP-1001250007851-H-C04"
## [15] "Subgroup 1:  DSP-1001250007851-H-D06"
## [16] "Subgroup 1:  DSP-1001250007851-H-C07"
## [17] "Subgroup 1:  DSP-1001250007851-H-D12"
## [18] "Subgroup 1:  DSP-1001250007851-H-A12"
## [19] "Subgroup 1:  DSP-1001250007851-H-A03"
## [20] "Subgroup 1:  DSP-1001250007851-H-D09"
## [21] "Subgroup 1:  DSP-1001250007851-H-A04"
## [22] "Subgroup 1:  DSP-1001250007851-H-D10"
## [23] "Subgroup 1:  DSP-1001250007851-H-A10"
## [24] "Subgroup 1:  DSP-1001250007851-H-C01"
## [25] "Subgroup 1:  DSP-1001250007851-H-C03"
## [26] "Subgroup 1:  DSP-1001250007851-H-A05"
## [27] "Subgroup 1:  DSP-1001250007851-H-C02"
## [28] "Subgroup 1:  DSP-1001250007851-H-B12"
## [29] "Subgroup 1:  DSP-1001250007851-H-B06"
## [30] "Subgroup 1:  DSP-1001250007851-H-B11"
## [31] "Subgroup 1:  DSP-1001250007851-H-B04"
## [32] "Subgroup 1:  DSP-1001250007851-H-B05"
## [33] "Subgroup 1:  DSP-1001250007851-H-D11"
## [34] "Subgroup 1:  DSP-1001250007851-H-D05"
## [35] "Subgroup 1:  DSP-1001250007851-H-B07"
## [36] "Subgroup 1:  DSP-1001250007851-H-C08"
## [37] "Subgroup 1:  DSP-1001250007851-H-D08"
## [38] "Subgroup 1:  DSP-1001250007851-H-B10"
## [39] "Subgroup 1:  DSP-1001250007851-H-A02"
## [40] "Subgroup 1:  DSP-1001250007851-H-A08"
## [41] "Subgroup 1:  DSP-1001250007851-H-A06"
## [42] "Subgroup 1:  DSP-1001250007851-H-A11"
## [43] "Subgroup 1:  DSP-1001250007851-H-D04"
## [44] "Subgroup 1:  DSP-1001250007851-H-D07"
## [45] "Subgroup 1:  DSP-1001250007851-H-B01"
## [46] "Subgroup 1:  DSP-1001250007851-H-B08"
## [47] "Subgroup 1:  DSP-1001250007851-H-A07"
## [48] "Subgroup 1:  DSP-1001250007851-H-B02"
## [49] "Subgroup 1:  DSP-1001250007851-H-B03"
## [50] "Subgroup 1:  DSP-1001250007851-H-C06"
## [51] "Subgroup 1:  DSP-1001250007851-H-D01"
## [52] "Subgroup 1:  DSP-1001250007851-H-C12"
## [53] "Subgroup 1:  DSP-1001250007851-H-C09"
## [54] "Subgroup 1:  DSP-1001250007851-H-C05"
## [55] "Subgroup 1:  DSP-1001250007851-H-C11"
## [56] "Subgroup 1:  DSP-1001250007851-H-C10"
## [57] "Subgroup 1:  DSP-1001250007851-H-B09"
## [58] "Subgroup 1:  DSP-1001250007851-H-D03"
## [59] "Subgroup 1:  DSP-1001250007851-H-D02"
## [60] "Subgroup 1:  DSP-1001250007868-B-A02"
samples_interest2 <- offspring(tree_info, node_interest2)
samples_interest2 <- as.character(na.omit(samples_interest2$label))
paste("Subgroup 2: ", samples_interest2)
##  [1] "Subgroup 2:  DSP-1001250007851-H-F10"
##  [2] "Subgroup 2:  DSP-1001250007851-H-F04"
##  [3] "Subgroup 2:  DSP-1001250007851-H-F11"
##  [4] "Subgroup 2:  DSP-1001250007851-H-F01"
##  [5] "Subgroup 2:  DSP-1001250007851-H-F05"
##  [6] "Subgroup 2:  DSP-1001250007851-H-E08"
##  [7] "Subgroup 2:  DSP-1001250007851-H-F03"
##  [8] "Subgroup 2:  DSP-1001250007851-H-F02"
##  [9] "Subgroup 2:  DSP-1001250007851-H-F09"
## [10] "Subgroup 2:  DSP-1001250007851-H-F06"
## [11] "Subgroup 2:  DSP-1001250007851-H-F07"
## [12] "Subgroup 2:  DSP-1001250007868-B-A05"
## [13] "Subgroup 2:  DSP-1001250007868-B-A03"
## [14] "Subgroup 2:  DSP-1001250007868-B-B01"
## [15] "Subgroup 2:  DSP-1001250007851-H-F12"
## [16] "Subgroup 2:  DSP-1001250007868-B-A04"
## [17] "Subgroup 2:  DSP-1002510007866-C-G02"
## [18] "Subgroup 2:  DSP-1002510007866-C-H12"
positions <- as.data.frame(positions)
colnames(positions) <- c("gene", "chr", "begin", "end") # Formatting

select_genes <- positions$gene[positions$chr %in% chr_list] # | positions$chr == chr2] # Grab chromosome specific genes

annotation <- as.data.frame(read.delim(paste0(patient, ".Annotations.tsv"), header=FALSE))
colnames(annotation) <- c("Sample_ID", "ANN")
select_samples <- annotation[annotation$Sample_ID %in% samples_interest | annotation$Sample_ID %in% samples_interest2,] # Grab subgroup specific Sample IDs

filter_counts <- target_Data@assayData[["log_q"]]
colnames(filter_counts) <- gsub('.dcc','', colnames(filter_counts))
filter_counts <- filter_counts[,select_samples$Sample_ID] # Filter Sample ID's

select_genes <- select_genes[select_genes %in% rownames(filter_counts)] # Only use genes that are actually in the data (pkc gene file has all of them)
filter_counts <- filter_counts[select_genes,] # Filter genes
plots<-list()
tables<-list()
labels<-list()
test<-"ttest"
mtc<-"BH"
counter=1

log_q_filter <-as.data.frame(filter_counts)

comps_df<-data.frame(comp='',val='')


active_group1 <- samples_interest #subgroup 1
active_group2 <- samples_interest2 #subgroup 2

# for (active_group1 in c("sub1")) {
#     for (active_group2 in c("sub2")) {

      #supress reduncant compares
      #if(active_group1==active_group2) {next}
      #comp<-paste(sort(c(active_group1,active_group2)),collapse = "_")
      #print(comp)
      #if(comp %in% comps_df$comp) {next}
      temp_df<-data.frame(comp=comp ,val=1)
      comps_df<-rbind(comps_df,temp_df)

      # labels[[counter]]<-paste(active_group1," vs ", active_group2)
      # group1<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group1]]
      # group2<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group2]]

      labels[[counter]]<-paste(active_group1," vs ", active_group2)
      group1<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$Sample_ID %in% active_group1]]
      group2<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$Sample_ID %in% active_group2]]

      #run t_tests
      results<-as.data.frame ( apply(log_q_filter, 1, function(x) t.test(x[colnames(group1)],x[colnames(group2)])$p.value) )
      colnames(results)<-"raw_p_value"

      #multiple_testing_correction
      adj_p_value<- p.adjust(results$raw_p_value,method=mtc)
      results<-cbind(results,adj_p_value)

      #calc_fdr
      FDR<- p.adjust(results$raw_p_value,method="fdr")
      results<-cbind(results,FDR)

      #fold_changes
      #as base data is already log transformed, means need to be subtracted to get FC in log space
      fchanges<-as.data.frame( apply(log_q_filter, 1, function(x) (mean(x[colnames(group1)]) - mean(x[colnames(group2)]) ) ) )
      colnames(fchanges)<-"FC"
      results<-cbind(results,fchanges)

      #add genenames
      results$Gene<-rownames(results)

      #set categories based on P-value & FDR for plotting
      results$Color <- "NS or FC < 0.5"
      results$Color[results$adj_p_value < 0.05] <- "P < 0.05"
      results$Color[results$FDR < 0.05] <- "FDR < 0.05"
      results$Color[results$FDR < 0.001] <- "FDR < 0.001"
      results$Color[abs(results$FC) < 0.5] <- "NS or FC < 0.5"
      results$Color <- factor(results$Color,
                              levels = c("NS or FC < 0.5", "P < 0.05", "FDR < 0.05", "FDR < 0.001"))

      #vulcanoplot

      # pick top genes for either side of volcano to label
      # order genes for convenience:

      results$invert_P <- (-log10(results$adj_p_value)) * sign(results$FC)
      top_g <- c()
      top_g <- c(top_g,
                 results[ind, 'Gene'][
                   order(results[ind, 'invert_P'], decreasing = TRUE)[1:15]],
                 results[ind, 'Gene'][order(results[ind, 'invert_P'], decreasing = FALSE)[1:15]])
      top_g<- unique(top_g)
      results <- results[, -1*ncol(results)] # remove invert_P from matrix

      # Graph results
      #plots[[counter]]<- ggplot(results,
      p <- ggplot(results,
                                      aes(x = FC, y = -log10(adj_p_value),
                                          color = Color, label = Gene)) +
        geom_vline(xintercept = c(0.5, -0.5), lty = "dashed") +
        geom_hline(yintercept = -log10(0.05), lty = "dashed") +
        geom_point() +
        labs(x = paste("Enriched genes from", chr_list, "in", "subgroup 2"," <- log2(FC) -> Enriched in", "subgroup 1"),
             y = "Significance, -log10(P)",
             color = "Significance") +
        scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                      `FDR < 0.05` = "lightblue",
                                      `P < 0.05` = "orange2",
                                      `NS or FC < 0.5` = "gray"),
                           guide = guide_legend(override.aes = list(size = 4))) +
        scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
        geom_text_repel(data = subset(results, FDR<0.05 & (-0.5>FC| FC>0.5)),
                        point.padding = 0.15, color = "black", size=3.5,
                        min.segment.length = .1, box.padding = .2, lwd = 2,
                        max.overlaps = 50) +
        theme_bw(base_size = 20) +
        theme(legend.position = "bottom") +
        #ggtitle(paste(slide,": ", test, mtc,"multitest corr"))
        ggtitle(paste(patient, ": ", test, mtc,"multitest corr"))

      #store tables for display later
      tables[[counter]]<-results

      counter = counter+1
      #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
  #   }
  # }

#ggplotly(p)
p

#grid.arrange(grobs=plots,ncol=2)

12 Code & Versions

Pipelineversion: v1 based on: https://bioconductor.org/packages/devel/workflows/vignettes/GeoMxWorkflows/inst/doc/GeomxTools_RNA-NGS_Analysis.html

The underlying code can be downloaded from the ‘Code’, button on the top of this page. Choose option ‘download Rmd’ to download the full pipeline which can be opened in R or Rstudio. Some filepaths are hardcoded and need to be changed according to your setup.

12.1 R session information

sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Netherlands.utf8  LC_CTYPE=English_Netherlands.utf8   
## [3] LC_MONETARY=English_Netherlands.utf8 LC_NUMERIC=C                        
## [5] LC_TIME=English_Netherlands.utf8    
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0                tidytree_0.4.2             
##  [3] ggtree_3.6.2                Biostrings_2.66.0          
##  [5] XVector_0.38.0              ape_5.6-2                  
##  [7] infercnv_1.14.0             preprocessCore_1.60.1      
##  [9] sva_3.46.0                  BiocParallel_1.32.1        
## [11] genefilter_1.80.1           mgcv_1.8-40                
## [13] nlme_3.1-157                scater_1.26.1              
## [15] scuttle_1.8.1               ggalluvial_0.12.3          
## [17] limma_3.54.0                SpatialExperiment_1.8.0    
## [19] SingleCellExperiment_1.20.0 SummarizedExperiment_1.28.0
## [21] GenomicRanges_1.50.1        GenomeInfoDb_1.34.3        
## [23] IRanges_2.32.0              MatrixGenerics_1.10.0      
## [25] matrixStats_0.63.0          standR_1.3.1               
## [27] viridis_0.6.2               viridisLite_0.4.1          
## [29] readxl_1.4.1                png_0.1-7                  
## [31] pathview_1.38.0             fgsea_1.24.0               
## [33] clusterProfiler_4.6.2       lubridate_1.9.2            
## [35] forcats_1.0.0               stringr_1.5.0              
## [37] purrr_1.0.1                 readr_2.1.4                
## [39] tidyr_1.3.0                 tibble_3.1.8               
## [41] tidyverse_2.0.0             gt_0.8.0                   
## [43] SpatialOmicsOverlay_0.99.11 RColorBrewer_1.1-3         
## [45] gridExtra_2.3               plotly_4.10.1              
## [47] DT_0.26                     ggrepel_0.9.2              
## [49] pheatmap_1.0.12             Rtsne_0.16                 
## [51] umap_0.2.9.0                cowplot_1.1.1              
## [53] reshape2_1.4.4              scales_1.2.1               
## [55] ggforce_0.4.1               dplyr_1.1.0                
## [57] knitr_1.41                  msigdbr_7.5.1              
## [59] GSVA_1.46.0                 SpatialDecon_1.8.0         
## [61] GeoMxWorkflows_1.5.0        GeomxTools_3.2.0           
## [63] NanoStringNCTools_1.6.0     ggplot2_3.4.1              
## [65] S4Vectors_0.36.0            Biobase_2.58.0             
## [67] BiocGenerics_0.44.0        
## 
## loaded via a namespace (and not attached):
##   [1] graphlayouts_0.8.3        pbapply_1.6-0            
##   [3] lattice_0.20-45           rJava_1.0-6              
##   [5] vctrs_0.6.0               blob_1.2.3               
##   [7] survival_3.3-1            spatstat.data_3.0-0      
##   [9] later_1.3.0               nloptr_2.0.3             
##  [11] DBI_1.1.3                 R.utils_2.12.2           
##  [13] rappdirs_0.3.3            uwot_0.1.14              
##  [15] dqrng_0.3.0               jpeg_0.1-9               
##  [17] zlibbioc_1.44.0           htmlwidgets_1.5.4        
##  [19] mvtnorm_1.1-3             future_1.29.0            
##  [21] leiden_0.4.3              logNormReg_0.5-0         
##  [23] parallel_4.2.1            irlba_2.3.5.1            
##  [25] tidygraph_1.2.2           Rcpp_1.0.9               
##  [27] KernSmooth_2.23-20        promises_1.2.0.1         
##  [29] DelayedArray_0.24.0       RcppParallel_5.1.5       
##  [31] magick_2.7.3              graph_1.76.0             
##  [33] RSpectra_0.16-1           fastmatch_1.1-3          
##  [35] rjags_4-13                digest_0.6.30            
##  [37] sctransform_0.3.5         scatterpie_0.1.8         
##  [39] DOSE_3.24.2               ggraph_2.1.0             
##  [41] pkgconfig_2.0.3           GO.db_3.16.0             
##  [43] spatstat.random_3.0-1     DelayedMatrixStats_1.20.0
##  [45] ggbeeswarm_0.6.0          iterators_1.0.14         
##  [47] minqa_1.2.5               DropletUtils_1.18.0      
##  [49] reticulate_1.26           beeswarm_0.4.0           
##  [51] modeltools_0.2-23         xfun_0.35                
##  [53] bslib_0.4.1               zoo_1.8-11               
##  [55] tidyselect_1.2.0          ica_1.0-3                
##  [57] gson_0.1.0                snow_0.4-4               
##  [59] rlang_1.1.0               jquerylib_0.1.4          
##  [61] glue_1.6.2                EBImage_4.40.0           
##  [63] lambda.r_1.2.4            ggsignif_0.6.4           
##  [65] labeling_0.4.2            GGally_2.1.2             
##  [67] httpuv_1.6.6              repmis_0.5               
##  [69] BiocNeighbors_1.16.0      TH.data_1.1-1            
##  [71] annotate_1.76.0           jsonlite_1.8.3           
##  [73] bit_4.0.5                 mime_0.12                
##  [75] systemfonts_1.0.4         gplots_3.1.3             
##  [77] BiocStyle_2.26.0          stringi_1.7.8            
##  [79] spatstat.sparse_3.0-0     scattermore_0.8          
##  [81] spatstat.explore_3.0-5    yulab.utils_0.0.6        
##  [83] bitops_1.0-7              cli_3.4.1                
##  [85] rhdf5filters_1.10.0       RSQLite_2.2.18           
##  [87] libcoin_1.0-9             data.table_1.14.6        
##  [89] KEGGgraph_1.58.0          timechange_0.1.1         
##  [91] rstudioapi_0.14           fftwtools_0.9-11         
##  [93] qvalue_2.30.0             fastcluster_1.2.3        
##  [95] locfit_1.5-9.6            listenv_0.8.0            
##  [97] ggthemes_4.2.4            miniUI_0.1.1.1           
##  [99] gridGraphics_0.5-1        R.oo_1.25.0              
## [101] dbplyr_2.3.1              lifecycle_1.0.3          
## [103] munsell_0.5.0             cellranger_1.1.0         
## [105] R.methodsS3_1.8.2         caTools_1.18.2           
## [107] codetools_0.2-18          coda_0.19-4              
## [109] vipor_0.4.5               lmtest_0.9-40            
## [111] xtable_1.8-4              ROCR_1.0-11              
## [113] formatR_1.14              BiocManager_1.30.19      
## [115] abind_1.4-5               farver_2.1.1             
## [117] parallelly_1.32.1         RANN_2.6.1               
## [119] aplot_0.1.10              askpass_1.1              
## [121] tiff_0.1-11               parallelDist_0.2.6       
## [123] SeuratObject_4.1.3        RcppAnnoy_0.0.20         
## [125] goftest_1.2-3             patchwork_1.1.2          
## [127] futile.options_1.0.1      cluster_2.1.3            
## [129] future.apply_1.10.0       Seurat_4.3.0             
## [131] Matrix_1.5-3              ellipsis_0.3.2           
## [133] ggridges_0.5.4            igraph_1.3.5             
## [135] lmerTest_3.1-3            argparse_2.2.1           
## [137] spatstat.utils_3.0-1      htmltools_0.5.3          
## [139] BiocFileCache_2.6.0       yaml_2.3.6               
## [141] utf8_1.2.2                XML_3.99-0.12            
## [143] withr_2.5.0               fitdistrplus_1.1-8       
## [145] bit64_4.0.5               multcomp_1.4-20          
## [147] foreach_1.5.2             outliers_0.15            
## [149] progressr_0.11.0          GOSemSim_2.24.0          
## [151] rsvd_1.0.5                ScaledMatrix_1.6.0       
## [153] memoise_2.0.1             evaluate_0.18            
## [155] tzdb_0.3.0                curl_4.3.3               
## [157] fansi_1.0.3               highr_0.9                
## [159] ggiraph_0.8.4             GSEABase_1.60.0          
## [161] tensor_1.5                edgeR_3.40.0             
## [163] cachem_1.0.6              org.Hs.eg.db_3.16.0      
## [165] deldir_1.0-6              HDO.db_0.99.1            
## [167] babelgene_22.9            rjson_0.2.21             
## [169] rstatix_0.7.2             tools_4.2.1              
## [171] sass_0.4.3                sandwich_3.0-2           
## [173] magrittr_2.0.3            RCurl_1.98-1.9           
## [175] car_3.1-1                 phyclust_0.1-32          
## [177] ggplotify_0.1.0           xml2_1.3.3               
## [179] httr_1.4.5                RBioFormats_0.99.9       
## [181] rmarkdown_2.18            boot_1.3-28              
## [183] globals_0.16.2            R6_2.5.1                 
## [185] Rhdf5lib_1.20.0           KEGGREST_1.38.0          
## [187] treeio_1.22.0             gtools_3.9.4             
## [189] coin_1.4-2                beachmat_2.14.0          
## [191] HDF5Array_1.26.0          BiocSingular_1.14.0      
## [193] rhdf5_2.42.0              splines_4.2.1            
## [195] carData_3.0-5             ggfun_0.0.9              
## [197] colorspace_2.0-3          generics_0.1.3           
## [199] base64enc_0.1-3           gridtext_0.1.5           
## [201] pillar_1.8.1              Rgraphviz_2.42.0         
## [203] tweenr_2.0.2              sp_1.5-1                 
## [205] uuid_1.1-0                R.cache_0.16.0           
## [207] GenomeInfoDbData_1.2.9    plyr_1.8.8               
## [209] gtable_0.3.1              futile.logger_1.4.3      
## [211] shadowtext_0.1.2          fastmap_1.1.0            
## [213] EnvStats_2.7.0            crosstalk_1.2.0          
## [215] doParallel_1.0.17         AnnotationDbi_1.60.0     
## [217] broom_1.0.4               backports_1.4.1          
## [219] openssl_2.0.4             filelock_1.0.2           
## [221] plotrix_3.8-2             lme4_1.1-31              
## [223] enrichplot_1.18.3         hms_1.1.2                
## [225] shiny_1.7.3               polyclip_1.10-4          
## [227] grid_4.2.1                numDeriv_2016.8-1.1      
## [229] lazyeval_0.2.2            crayon_1.5.2             
## [231] MASS_7.3-57               downloader_0.4           
## [233] sparseMatrixStats_1.10.0  reshape_0.8.9            
## [235] compiler_4.2.1            ggtext_0.1.2             
## [237] spatstat.geom_3.0-3

12.2 References

1 Levi van Hijfte, Marjolein Geurts, Wies R. Vallentgoed, Paul H.C. Eilers, Peter A.E. Sillevis Smitt, Reno Debets, Pim J. French, Alternative normalization and analysis pipeline to address systematic bias in NanoString GeoMx Digital Spatial Profiling data, iScience, Volume 26, Issue 1, 2023, 105760, ISSN 2589-0042, https://doi.org/10.1016/j.isci.2022.105760.

2 Zhao, Y., Wong, L. & Goh, W.W.B. How to do quantile normalization correctly for gene expression data analyses. Sci Rep 10, 15534 (2020). https://doi.org/10.1038/s41598-020-72664-6

knitr::knit_exit()
---
title: "Nanostring GeoMx analysis"
author: "Ies Nijman & Pim Kloosterman"
output: 
  html_document:
    code_download: true
    code_folding: hide
    toc: true
    toc_float:
      collapsed: false
      smooth_scroll: true
    toc_depth: 3
editor_options: 
  markdown: 
    wrap: 72
  chunk_output_type: inline
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
```

![](http://useq.nl/wp-content/uploads/2022/12/USEQ-logo-subtitle.png)

# Klant: Useq

# Project: Spatial Organ Atlas

# Dataset: Kidney

**date: `r format(Sys.time(), '%H %M %a %d %B, %Y')`**

![](http://useq.nl/wp-content/uploads/2022/12/decoration-stroke-flat.png)

**loading dependencies** Please make sure the following packages are
installed and required libraries can be loaded:

-   install.packages("pkgbuild") // pkgbuild::check_build_tools()
-   install.packages("devtools")
-   devtools::install_github("Nanostring-Biostats/NanoStringNCTools")
-   devtools::install_github("Nanostring-Biostats/GeomxTools", ref =
    "dev")
-   BiocManager::install("GeoMxWorkflows")
-   devtools::install_github("DavisLaboratory/standR")
-   BiocManager::install("SpatialDecon")
-   BiocManager::install("GSVA")
-   install.packages("plotly")
-   install.packages("DT")
-   install.packages("msigdbr")
-   install.packages("digest")
-   install.packages("rmarkdown")
-   install.packages("kable")
-   BiocManager::install("EBImage")
-   install.packages('scattermore')
-   install.packages('pbapply')
-   install.packages('plotrix')
-   install.packages('ggtext')
-   install.packages('RBioFormats')
-   BiocManager::install("aoles/RBioFormats")
-   install.packages("C:/Users/inijman/Downloads/SpatialOmicsOverlay-0.99.13-beta.tar.gz",
    dependencies = TRUE, repos = NULL)
-   devtools::install_github("DavisLaboratory/standR")
-   BiocManager::install("clusterProfiler")
-   BiocManager::install("pathview")

```{r load_libraries, message=FALSE, warning=FALSE}

#load libraries

#stack_size <- getOption("pandoc.stack.size", default = "512m")
#options(java.parameters = c("-XX:+UseConcMarkSweepGC", "-Xmx8192m, -XX:MetaspaceSize=1024M"))
options(java.parameters = c("-XX:+UseConcMarkSweepGC", "-Xmx8192m"))
library(NanoStringNCTools)
library(GeomxTools)
library(GeoMxWorkflows)
library(SpatialDecon)
library(GSVA) #for pathway analyses
library(msigdbr) #for molecular signatures in pathway analyses
library(knitr)
library(dplyr)
library(ggforce)
library(ggplot2)
library(scales) # for percent
library(reshape2)  # for melt
library(cowplot)   # for plot_grid
library(umap)
library(Rtsne)
library(pheatmap)  # for pheatmap
library(ggrepel) 
library(scales) #for ggplot peaudolog to prevent errors on log(0)
library(DT)
library(plotly)
library(gridExtra)
library(RColorBrewer)
library(SpatialOmicsOverlay)
library(gt)
library(tidyverse)
library(clusterProfiler)
library(fgsea)
library(pathview)
library(png)
library(readxl)
library(viridis)

library(standR)
library(SpatialExperiment)
library(limma)
library(ggalluvial)
library(scater)

#BiocManager::install("sva")
library(sva)

#BiocManager::install("preprocessCore") #quantile norm
library(preprocessCore)

#install.packages("Polychrome") #Get colors
#library(Polychrome)


library(infercnv) # CNV
library("ape")
library("Biostrings")
library("ggtree")
library(tidytree)
library(ggpubr)
```

# 1 loading base files

```{r loading_base_data}
# Reference the main folder 'file.path' containing the sub-folders with each data file type:
datadir<-file.path("L:/pkloosterman/Github/DKD_Kidney/")
#datadir<-file.path("E:/DKD_Kidnet/")
```

To locate a specific file path replace the above line with datadir \<-
file.path("\~/Folder/SubFolder/DataLocation") replace the Folder,
SubFolder, DataLocation as needed. The DataLocation folder should
contain a dccs, pkcs, and annotation folder with each set of files
present as needed automatically list files in each directory for use.

**Take care you import a column with nuclei count separately if you
want.**

```{r parse_files}
DCCFiles <- dir(file.path(datadir, "dccs"), pattern = ".dcc$",
                full.names = TRUE, recursive = TRUE)
PKCFiles <- dir(file.path(datadir, "pkcs"), pattern = ".pkc$",
                full.names = TRUE, recursive = TRUE)
SampleAnnotationFile <-
  dir(file.path(datadir, "annotation"), pattern = "^[^~]",
      full.names = TRUE, recursive = TRUE)
```

# 2 load data

```{r load_data}
Data <-
  readNanoStringGeoMxSet(dccFiles = DCCFiles,
                         pkcFiles = PKCFiles,
                         phenoDataFile = SampleAnnotationFile,
                         phenoDataSheet = "Template",
                         phenoDataDccColName = "Sample_ID",
                         protocolDataColNames = c("aoi", "roi"),
                         experimentDataColNames = c("panel"))

#save data to prevent loading time for retakes
#saveData<-Data
#Data<-saveData

#change Data column names and manual correction of spelling errors
Data@phenoData@data[["slide_name"]]<-Data@phenoData@data[["slide name"]]
Data@phenoData@data[["slide name"]]<-  NULL


#+1 references the slide name column
ann_size<-length(colnames(Data@phenoData@data)[grepl("ANN",colnames(Data@phenoData@data))])+1 
ann_names<-c(colnames(Data@phenoData@data)[grepl("ANN",colnames(Data@phenoData@data))],"slide_name")

# Feel free to change the order of which colors are appointed.
color<-c("#A349A4", "#FFFF33", "#E7298A", "#091833", "#1B9E77", "#D95F02", "#7570B3",  "#66A61E", "#E6AB02", "#8DD3C7", "#9F000F", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F", "#377EB8", "#984EA3", "#4DAF4A", "#FF71CE", "#FF7F00", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928")

# Use count and count_max to set the range of color needed for each column.
# With setNames() the color code is linked to each unique individual value of each column.
count=1
color_list = list()
for (ann in ann_names) {
  count_max = count+length(unique(Data@phenoData@data[[ann]]))-1
  color_list[[ann]]<-setNames(color[count:count_max], unique(Data@phenoData@data[[ann]]))
  count=count_max+1
}
```

```{r color table}
value_names = c()
for (ann in ann_names) {
  value_names <- c(value_names, unique(Data@phenoData@data[[ann]]))
}

color_df <- as.data.frame(value_names)
color_df$color <- color[0:length(value_names)]

color_df %>% 
  mutate(color = fct_inorder(color)) |> 
  gt() %>% 
  data_color(columns = color, colors = as.character(color)) %>%
  tab_options(container.height = 500)
```

```{r}
paste("Reads from following runs used: ",unique(pData(protocolData(Data))$SeqSetId))
```

# 3 Study design

```{r annotate}
pkcs <- annotation(Data)
modules <- gsub(".pkc", "", pkcs)
kable(data.frame(PKCs = pkcs, modules = modules))
```

Select the annotations we want to show, use \`\` to surround column
names with spaces or special symbols

```{r select_annotations}
#count_mat <- dplyr::count(pData(Data), ANN1,ANN2,slide_name)
count_mat <- pData(Data) %>% dplyr::count(pData(Data)[c(ann_names)])
```

Simplify the slide names if required

```{r simplify_names}
# count_mat$slide_name <- gsub("disease", "d", gsub("normal", "n", count_mat$slide_name))
#count_mat$path_ann <- gsub("i", "", count_mat$path_ann) #correcting spelling error
```

Gather the data and plot in order: class, slide name, region, segment

```{r gather_data}
test_gr <- gather_set_data(count_mat, 1:all_of(ann_size))
test_gr$x <- factor(test_gr$x, labels = ann_names)
```

Plot Sankey

```{r Sankey_plot, fig.width=20,fig.height=11}
ggplot(test_gr, height = 10, width = 10, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = ANN2), alpha = 0.5, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2) +
  geom_parallel_sets_labels(color = "white", size = 5) +
  theme_classic(base_size = 12) + 
  theme(legend.position = "bottom",
        axis.ticks.y = element_blank(),
        axis.line = element_blank(),
        axis.text.y = element_blank()) +
  scale_y_continuous(expand = expansion(0)) + 
  scale_x_discrete(expand = expansion(0)) +
  labs(x = "", y = "") +
  scale_fill_manual(values=color_list$ANN2) +
  annotate(geom = "segment", x = 4.25, xend = 4.25,
           y = 10, yend = 61, lwd = 2) +
  annotate(geom = "text", x = 4.19, y = 25, angle = 90, size = 5,
           hjust = 0.5, label = "50 segments")
```

# 4 QC & Pre-processing

Shift counts to one

```{r shift_counts}
#shift any expression counts with a value of 0 to 1 to enable in downstream transformations.
Data <- shiftCountsOne(Data, useDALogic = TRUE)
```

# 4.1 Segment QC

We first assess sequencing quality and adequate tissue sampling for
every ROI/AOI segment.

Every ROI/AOI segment will be tested for:

Raw sequencing reads: segments with \>1000 raw reads are removed. %
Aligned,% Trimmed, or % Stitched sequencing reads: segments below \~80%
for one or more of these QC parameters are removed. % Sequencing
saturation ([1-deduplicated reads/aligned reads]%): segments below \~50%
require additional sequencing to capture full sample diversity and are
not typically analyzed until improved. Negative Count: this is the
geometric mean of the several unique negative probes in the GeoMx panel
that do not target mRNA and establish the background count level per
segment; segments with low negative counts (1-10) are not necessarily
removed but may be studied closer for low endogenous gene signal and/or
insufficient tissue sampling. No Template Control (NTC) count: values
\>1,000 could indicate contamination for the segments associated with
this NTC; however, in cases where the NTC count is between 1,000-
10,000, the segments may be used if the NTC data is uniformly low (e.g.
0-2 counts for all probes). Nuclei: \>100 nuclei per segment is
generally recommended; however, this cutoff is highly study/tissue
dependent and may need to be reduced; what is most important is
consistency in the nuclei distribution for segments within the study.
Area: generally correlates with nuclei; a strict cutoff is not generally
applied based on area.

# 4.1.1 Select Segment QC

First, we select the QC parameter cutoffs, against which our ROI/AOI
segments will be tested and flagged appropriately. We have selected the
appropriate study-specific parameters for this study. Note: the default
QC values recommended above are advised when surveying a new dataset for
the first time.

Default QC cutoffs are commented in () adjacent to the respective
parameters study-specific values were selected after visualizing the QC
results in more detail below

```{r set_QC_params}
QC_params <-
  list(minSegmentReads = 1000, # Minimum number of reads (1000)
       percentTrimmed = 80,    # Minimum % of reads trimmed (80%)
       percentStitched = 80,   # Minimum % of reads stitched (80%)
       percentAligned = 75,    # Minimum % of reads aligned (80%)
       percentSaturation = 50, # Minimum sequencing saturation (50%)
       minNegativeCount = 1,   # Minimum negative control counts (10)
       maxNTCCount = 9000,     # Maximum counts observed in NTC well (1000)
       minNuclei = 20,        # Minimum # of nuclei estimated (100)
       minArea = 1000)         # Minimum segment area (5000)
Data <-
  setSegmentQCFlags(Data, qcCutoffs = QC_params)        

cat("pre-QC features:", dim(Data)[1], "\npre-QC samples:", dim(Data)[2])

#Table for clarification
QCparams_df <- data.frame (
  items = c("minSegmentReads","percentTrimmed","percentStitched","percentAligned","percentSaturation",
            "minNegativeCount","maxNTCCount","minNuclei","minArea"),
  defaults = c(1000,80,80,80,50,10,1000,100,5000),
  actual = c(QC_params$minSegmentReads,QC_params$percentTrimmed,QC_params$percentStitched,QC_params$percentAligned,QC_params$percentSaturation,QC_params$minNegativeCount,QC_params$maxNTCCount,QC_params$minNuclei,QC_params$minArea)
)

datatable(QCparams_df, rownames=FALSE,
          caption = "QC thresholds",
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )
)

```

Collate QC Results

```{r collate_QC_results}
QCResults <- protocolData(Data)[["QCFlags"]]
flag_columns <- colnames(QCResults)
QC_Summary <- data.frame(Pass = colSums(!QCResults[, flag_columns]),
                         Warning = colSums(QCResults[, flag_columns]))

QCResults$QCStatus <- apply(QCResults, 1L, function(x) {
  ifelse(sum(x) == 0L, "PASS", "WARNING")
})

QC_Summary["TOTAL FLAGS", ] <-
  c(sum(QCResults[, "QCStatus"] == "PASS"),
    sum(QCResults[, "QCStatus"] == "WARNING"))

col_by <- "ANN1"
col_by_plate <- "Plate_ID"
```

# 4.2 Graphical summaries of QC statistics {.tabset .tabset-pills}

Use the tab-menu to navigate!

```{r QC_plotting}
QC_histogram <- function(assay_data = NULL,
                         annotation = NULL,
                         fill_by = NULL,
                         thr = NULL,
                         scale_trans = NULL) {
  plt <- ggplot(assay_data,
                aes_string(x = paste0("unlist(`", annotation, "`)"),
                           fill = fill_by)) +
    geom_histogram(bins = 50) +
    geom_vline(xintercept = thr, lty = "dashed", color = "black") +
    theme_bw() + guides(fill = "none") +
    facet_wrap(as.formula(paste("~", fill_by)), nrow = 4) +
    scale_fill_manual(values=color_list$ANN1) +
    labs(x = annotation, y = "segments, #", title = annotation)
  if(!is.null(scale_trans)) {
    plt <- plt +
      scale_x_continuous(trans = scale_trans)
  }
  plt
}
```

## Trimmed

% Trimmed sequencing reads: segments below the QC parameters are removed.

```{r}
QC_histogram(sData(Data), "Trimmed (%)", col_by, QC_params$percentTrimmed)
```

## Stiched (%)

% Stiched sequencing reads: segments below the QC parameters are removed.

```{r}
QC_histogram(sData(Data), "Stitched (%)", col_by, QC_params$percentStitched)
```

## Aligned (%)

% Aligned sequencing reads: segments below the QC parameters are removed.

```{r}
QC_histogram(sData(Data), "Aligned (%)", col_by,QC_params$percentAligned)
```

## Sequencing Saturation (%) {.active}

% Sequencing saturation ([1-deduplicated reads/aligned reads]%): segments below the QC parameter require additional sequencing to capture full sample diversity and are not typically analyzed until improved.

```{r}
QC_histogram(sData(Data), "Saturated (%)", col_by, QC_params$percentSaturation) +
  labs(title = "Sequencing Saturation (%)",
       x = "Sequencing Saturation (%)")
```

## Area

Area: generally correlates with nuclei; a strict cutoff is not generally applied based on area.

```{r}
QC_histogram(sData(Data), "area", col_by, QC_params$minArea, scale_trans = "log10")
```

## Nuclei count

Nuclei: >100 nuclei per segment is generally recommended; however, this cutoff is highly study/tissue dependent and may need to be reduced; what is most important is consistency in the nuclei distribution for segments within the study.

```{r}
QC_histogram(sData(Data), "nuclei", col_by, QC_params$minNuclei)
```

## DuplicationRate

Show the percentage of duplicated reads per raw reads.

```{r}
ggplot(pData(protocolData(Data)),
       aes(x = Plate_ID, fill=Plate_ID,
          y = (DeduplicatedReads/Raw))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Deduplicated / Raw reads") +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()
```

## Negprobes vs Endogenous

Compare Negprobes vs Endogenous expression to see if the endogenous (real) probes 
are significantly higher than Negprobes. This means that there is a 
high signal-to-noise ratio.

```{r plot_negprobe_data, fig.width=10,fig.height=5}
tmp_target_Data <- aggregateCounts(Data)

#get negative probe data
negs<-subset(tmp_target_Data,CodeClass=="Negative")

p1<-ggplot(pData(negs),
       aes(x = ANN2, fill = ANN2,
          y = assayDataElement(negs, elt = "exprs"))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Negative probes Expression") +
  scale_y_continuous(limits = c(1,3000), trans = "log2") +
  theme_bw() + coord_flip()


# get endogenous probe data
end<-subset(tmp_target_Data,CodeClass=="Endogenous")

p2<-ggplot(pData(end),
       aes(x = ANN2, fill = ANN2,
           y = colMeans(assayDataElement(end, elt = "exprs")))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Endogenous probes Expression (mean)") +
  scale_y_continuous(limits = c(1,3000),trans = "log2") +
  theme_bw() + coord_flip()

pl <-list(p1,p2)
plot_grid(plotlist=pl, nrow=2, align='v')

```

## Neg_probe reads compared to raw_reads

Show the difference between deduplicated reads and negative control reads. It is expected that deduplicated reads are substantially higher than negative control reads.

```{r}

# make background total neg probe count
fdata_df<-fData(Data)
negprobesnames<-rownames(fdata_df[fdata_df$Negative==TRUE,])
temp_exp<-assayDataElement(Data,elt='exprs')
negprobe_expr_fd<-temp_exp[rownames(temp_exp) %in% negprobesnames,]
tot_neg_ctrl_reads<-colSums(negprobe_expr_fd)
tot_dedup_reads<-pData(protocolData(Data))$DeduplicatedReads

df<-data.frame('aoi'= names(tot_neg_ctrl_reads),'tot_dedup_reads' = as.numeric(tot_dedup_reads),'tot_neg_ctrl_reads'=as.numeric(tot_neg_ctrl_reads))
df<-melt(df,id="aoi")
ggplot(df,aes(fill=variable,y=value,x=aoi)) + 
  geom_bar(position="identity",stat="identity") +
  scale_y_continuous(trans = log2_trans()) +
  theme(legend.position="bottom",axis.text.x = element_blank(),axis.ticks.x=element_blank())                     
 
```

## Duplicated reads vs Background

Compare duplicated reads and background noise to show that the reads significantly duplicated and thus checked to avoid the questioning about the validity of the reads.

```{r}
# get dcc per plate. sum negprobe counts/dcc/plate
ggplot(pData(protocolData(Data)),
       aes(x = Plate_ID, fill=Plate_ID,
          y = DeduplicatedReads)) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = "Deduplicated / Raw reads") +
  scale_y_log10()+
  geom_hline(data =pData(protocolData(Data)) , 
           aes(yintercept = NTC, colour=Plate_ID)) +
  theme_bw()

```

## Duplicated reads vs ROIarea

See if all ROIs contain duplicated reads for validity.

```{r, fig.width=15,fig.height=5}
temp_df<-cbind(pData(Data),pData(protocolData(Data)),dcc=rownames(pData(Data)))

ggplot(temp_df,
       aes(x = dcc, colour=slide_name,
          y = (DeduplicatedReads/area) )) +
  geom_point() +
  ylim(0,20) + 
  labs(y = "Deduplicated reads / ROI area") +
  theme(axis.text.x = element_text(size =6, angle=90, hjust=1) )

```

## Duplicated reads vs nuclei

Show the amount of deduplicated reads per nuclei.

```{r, fig.width=15,fig.height=5}
temp_df<-cbind(pData(Data),pData(protocolData(Data)),dcc=rownames(pData(Data)))

ggplot(temp_df,
       aes(x = dcc, colour=slide_name,
          y = (DeduplicatedReads/nuclei) )) +
  geom_point() +
  ylim(0,50) +                                                      # Adjust per project
  labs(y = "Deduplicated reads / nuclei") +
  theme(axis.text.x = element_text(size =6, angle=90, hjust=1) )

```

# 4.3 Process Negative GeoMeans

Negative Count: this is the geometric mean of the several unique negative probes in the GeoMx panel that do not target mRNA and establish the background count level per segment; segments with low negative counts (1-10) are not necessarily removed but may be studied closer for low endogenous gene signal and/or insufficient tissue sampling.

```{r}
# Calculate the negative geometric means for each module
# It will show only the negative probes geomean, so expect less segments.
negativeGeoMeans <- 
  esBy(negativeControlSubset(Data), 
       GROUP = "Module", 
       FUN = function(x) { 
         assayDataApply(x, MARGIN = 2, FUN = ngeoMean, elt = "exprs") 
       }) 
protocolData(Data)[["NegGeoMean"]] <- negativeGeoMeans

negCols <- paste0("NegGeoMean_", modules)
pData(Data)[, negCols] <- sData(Data)[["NegGeoMean"]]
for(ann in negCols) {
  plt <- QC_histogram(pData(Data), ann, col_by, 2, scale_trans = "log10")
  print(plt)
}


# Detatch neg_geomean columns ahead of aggregateCounts call

pData(Data) <- pData(Data)[, !colnames(pData(Data)) %in% negCols]

```

Show all NTC values, Freq = \# of Segments with a given NTC count:

```{r QC_tables}
QC<-sData(Data)

ntc_df <-QC[,c("slide_name","Plate_ID","NTC_ID","NTC")]
temptable<-ntc_df %>% dplyr::count(ntc_df$slide_name,ntc_df$NTC_ID,ntc_df$Plate_ID,ntc_df$NTC)
colnames(temptable) <- c("Slide_name","NTC_ID","Plate_ID","NTC_count","Number_of_samples")
datatable(temptable, rownames = FALSE)


kable(table(NTC_Count = sData(Data)$NTC), col.names = c("NTC Count", "# of Segments"))

kable(QC_Summary, caption = "QC Summary Table for each Segment")

datatable(QC_Summary,
          caption = "AOI QC Summary",
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )
)
```

Show AOIs which fail critical QCs.

```{r list_failures}
QC<-sData(Data)
undersat<-subset(QC, `Saturated (%)`<= QC_params$percentSaturation)

if(nrow(undersat)> 0) {

datatable(aggregate(undersat, by=list(undersat$SampleID),paste,collapse=";"),
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )
)}
```
Show the reasons why the AOIs fail critical QCs.

```{r info_list_failures}
if(nrow(undersat)> 0) {

datatable(aggregate(undersat$QCFlags, by=list(undersat$SampleID),paste,collapse=";"),
          extensions = 'Buttons', options = list (
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          )) %>%
    formatStyle(columns = colnames(undersat$QCFlags), backgroundColor = styleEqual("TRUE", "#850000"), color = styleEqual("TRUE", "white")) 
  }
```

Subsetting our dataset has removed samples which did not pass QC

```{r subsetting_QC_fails}
Data <- Data[, QCResults$QCStatus == "PASS"]
```

Generally keep the qcCutoffs parameters unchanged. Set
removeLocalOutliers to FALSE if you do not want to remove local outliers

```{r process_QC}
Data <- setBioProbeQCFlags(Data, 
                               qcCutoffs = list(minProbeRatio = 0.1,
                                                percentFailGrubbs = 20), 
                               removeLocalOutliers = FALSE)

ProbeQCResults <- fData(Data)[["QCFlags"]]
```

Define QC table for Probe QC

```{r define_qc_table}
qc_df <- data.frame(Passed = sum(rowSums(ProbeQCResults[, -1]) == 0),
                    Global = sum(ProbeQCResults$GlobalGrubbsOutlier),
                    Local = sum(rowSums(ProbeQCResults[, -2:-1]) > 0
                                & !ProbeQCResults$GlobalGrubbsOutlier))
```

Subset object to exclude all that did not pass Ratio & Global testing

```{r subset}
ProbeQCPassed <- 
  subset(Data, 
         fData(Data)[["QCFlags"]][,c("LowProbeRatio")] == FALSE &
           fData(Data)[["QCFlags"]][,c("GlobalGrubbsOutlier")] == FALSE)

Data <- ProbeQCPassed 
cat("After QC features:", dim(Data)[1], "\nAfter QC samples:", dim(Data)[2])
```

Check how many unique targets the object has

```{r unique_check}
length(unique(featureData(Data)[["TargetName"]]))
```

Collapse to targets

```{r collaps_targets}
target_Data <- aggregateCounts(Data)

exprs(target_Data)[1:5, 1:2]
```

Define LOQ SD threshold and minimum value

```{r set_LSQ}
cutoff <- 2
minLOQ <- 2
```

# 4.4 Limit of Quantification

We define a limit of quantification (LOQ) per ROI/AOI segment based on
the negative control probes to guide the filtering of segments and genes
with low signal relative to background. The formula for calculating the
LOQ in the $i^{th}$ segment at $n$ standard deviations ($n = 2$ for this
study) is: $LOQ_i=geomean(NegProbe_i)*geoSD(NegProbe_i)^n$

Calculate LOQ per module tested

```{r calculate_LOQ}
LOQ <- data.frame(row.names = colnames(target_Data))
for(module in modules) {
  vars <- paste0(c("NegGeoMean_", "NegGeoSD_"),
                 module)
  if(all(vars[1:2] %in% colnames(pData(target_Data)))) {
    LOQ[, module] <-
      pmax(minLOQ,
           pData(target_Data)[, vars[1]] * 
             pData(target_Data)[, vars[2]] ^ cutoff)
  }
}
pData(target_Data)$LOQ <- LOQ
```

# 4.5 Filtering

After determining the limit of quantification (LOQ) per segment, we
recommend filtering out either segments and/or genes with abnormally low
signal. Filtering is an important step to focus on the true biological
data of interest.

We determine the number of genes detected in each segment across the
dataset.

```{r filtering}
LOQ_Mat <- c()
for(module in modules) {
  ind <- fData(target_Data)$Module == module
  Mat_i <- t(esApply(target_Data[ind, ], MARGIN = 1,
                     FUN = function(x) {
                       x > LOQ[, module]
                     }))
  LOQ_Mat <- rbind(LOQ_Mat, Mat_i)
}
# ensure ordering since this is stored outside of the geomxSet
LOQ_Mat <- LOQ_Mat[fData(target_Data)$TargetName, ]
```

# 4.5.1 Segment Gene Detection

We first filter out segments with exceptionally low signal. These
segments will have a small fraction of panel genes detected above the
LOQ relative to the other segments in the study. Let's visualize the
distribution of segments with respect to their % genes detected:

Save detection rate information to pheno data

```{r save_detectino_rate}
pData(target_Data)$GenesDetected <- 
  colSums(LOQ_Mat, na.rm = TRUE)
pData(target_Data)$GeneDetectionRate <-
  pData(target_Data)$GenesDetected / nrow(target_Data)

```

Determine detection thresholds: 1%, 5%, 10%, 15%, \>15%

```{r determine+thresholds}
pData(target_Data)$DetectionThreshold <- 
  cut(pData(target_Data)$GeneDetectionRate,
      breaks = c(0, 0.01, 0.05, 0.1, 0.15, 0.2,1),
      labels = c("<1%", "1-5%", "5-10%", "10-15%", "15-20%", ">20%"))

# stacked bar plot of different cut points (1%, 5%, 10%, 15%)
ggplot(pData(target_Data),
       aes(x = DetectionThreshold)) +
  geom_bar(aes(fill = ANN2)) +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  theme_bw() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  scale_fill_manual(values=color_list$ANN2) +
  labs(x = "Gene Detection Rate",
       y = "Segments, #",
       fill = "Segment Type")
```

cut percent genes detected at 1, 5, 10, 15

```{r cut_to_percent}
kable(table(pData(target_Data)$DetectionThreshold,
            pData(target_Data)$ANN2))

# set threshold for detectionlevel
# default 0.1
gene_det_threshold <- 0.05

target_Data <-
  target_Data[, pData(target_Data)$GeneDetectionRate >= gene_det_threshold]

dim(target_Data)
```

# 4.6 Manual removal of samples/classes

```{r remove_samples}
active_aois<-names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))
```

re-Collect annotations

```{r collect_annotations}
# gather the data and plot in order: class, slide name, region, segment
#count_mat <- dplyr::count(pData(Data), ANN1,ANN2,ANN3,ANN4,slide_name)

temp_qc <- temp_df
temp_qc$QCResult <- QCResults$QCStatus
temp_qc$QCResult[temp_qc$QCResult == "WARNING"] <- "X"

#count_mat <- dplyr::count(a, ANN1, ANN2, slide_name, QCResult)
count_mat <- temp_qc %>% dplyr::count(temp_qc[c(ann_names, "QCResult")])

test_gr <- gather_set_data(count_mat, 1:(ann_size+1))
test_gr$x <- factor(test_gr$x, labels = c(ann_names, "QCResult"))
```

re-Plot Sankey

```{r plot_sankey, fig.width=20,fig.height=11}
ggplot(test_gr, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = ANN2), alpha = 0.5, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2) +
  geom_parallel_sets_labels(color = "white", size = 5) +
  theme_classic(base_size = 17) + 
  theme(legend.position = "bottom",
        axis.ticks.y = element_blank(),
        axis.line = element_blank(),
        axis.text.y = element_blank()) +
  scale_y_continuous(expand = expansion(0)) + 
  scale_x_discrete(expand = expansion(0)) +
  scale_fill_manual(values=color_list$ANN2) +
  labs(x = "", y = "") +
  annotate(geom = "segment", x = 3.25, xend = 3.25, y = 10, 
           yend = 60, lwd = 2) +
  annotate(geom = "text", x = 3.19, y = 25, angle = 90, size = 5,
           hjust = 0.5, label = "50 segments")
```

# 4.7 Gene Detection Rate

Calculate detection rate

```{r clc_detection_rate}
LOQ_Mat <- LOQ_Mat[, colnames(target_Data)]
fData(target_Data)$DetectedSegments <- rowSums(LOQ_Mat, na.rm = TRUE)
fData(target_Data)$DetectionRate <-
  fData(target_Data)$DetectedSegments / nrow(pData(target_Data))
```

Gene of interest detection table

```{r gene_of_interest_table}
goi <- c("PDCD1", "CD274", "IFNG", "CD8A", "CD68", "EPCAM",
         "KRT18", "NPHS1", "NPHS2", "CALB1", "CLDN8")
goi_df <- data.frame(
  Gene = goi,
  Number = fData(target_Data)[goi, "DetectedSegments"],
  DetectionRate = percent(fData(target_Data)[goi, "DetectionRate"]))
```

# 4.8 Gene Filtering

We will graph the total number of genes detected in different
percentages of segments. Based on the visualization below, we can better
understand global gene detection in our study and select how many low
detected genes to filter out of the dataset. Gene filtering increases
performance of downstream statistical tests and improves interpretation
of true biological signal.

Plot detection rate

```{r plot_det_rate}
plot_detect <- data.frame(Freq = c(1, 5, 10, 20, 30, 50))

# Manual gene cut-off
cutoff <- 5
cutoff <- which(sapply(plot_detect$Freq, FUN=function(X) cutoff %in% X)) + 0.5 # Select step from plot_detect. 1 = 1, 5 = 2, 10 = 3, etc. +0.5 to dodge bin.

plot_detect$Number <-
  unlist(lapply(c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5),
                function(x) {sum(fData(target_Data)$DetectionRate >= x)}))
plot_detect$Rate <- plot_detect$Number / nrow(fData(target_Data))
rownames(plot_detect) <- plot_detect$Freq

ggplot(plot_detect, aes(x = as.factor(Freq), y = Rate, fill = Rate)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = formatC(Number, format = "d", big.mark = ",")),
            vjust = 1.6, color = "black", size = 4) +
  
  geom_vline(xintercept = 3.5, color = "red") +
  geom_vline(xintercept = cutoff, color = "red") +
  annotate(x=3.5,y=0.1,label="Default cut-off",size=3,geom="label") +
  annotate(x=cutoff,y=0.1,label="Manual cut-off",size=3,geom="label") +
  
  scale_fill_gradient2(low = "orange2", mid = "lightblue",
                       high = "dodgerblue3", midpoint = 0.65,
                       limits = c(0,1),
                       labels = scales::percent) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent, limits = c(0,1),
                     expand = expansion(mult = c(0, 0))) +
  labs(x = "% of Segments",
       y = "Genes Detected, % of Panel > LOQ")
```

Subset to target genes detected in at least a manual percentage of the samples with a default of 10%. Also
manually include the negative control probe, for downstream use.

```{r subset_to_10p_detected_genes}
# default=0.1
negativeProbefData <- subset(fData(target_Data), CodeClass == "Negative")
neg_probes <- unique(negativeProbefData$TargetName)
target_Data <- 
  target_Data[fData(target_Data)$DetectionRate >= 0.05 |
                    fData(target_Data)$TargetName %in% neg_probes, ]

# retain only detected genes of interest
goi <- goi[goi %in% rownames(target_Data)]

dim(target_Data)
```

# 5 Normalization

We will now normalize the GeoMx data for downstream visualizations and
differential expression. The two common methods for normalization of
DSP-NGS RNA data are i) quartile 3 (Q3) or ii) background normalization.

Both of these normalization methods estimate a normalization factor per
segment to bring the segment data distributions together. More advanced
methods for normalization and modeling are under active development.
However, for most studies, these methods are sufficient for
understanding differences between biological classes of segments and
samples.

Q3 normalization is typically the preferred normalization strategy for
most DSP-NGS RNA studies. Given the low negative probe counts in this
particular dataset as shown during Segment QC, we would further avoid
background normalization as it may be less stable.

Before normalization, we will explore the relationship between the upper
quartile (Q3) of the counts in each segment with the geometric mean of
the negative control probes in the data. Ideally, there should be a
separation between these two values to ensure we have stable measure of
Q3 signal. If you do not see sufficient separation between these values,
you may consider more aggressive filtering of low signal segments/genes.

Q3 and quantile normalization are radically different in their approach. 
Q3 normalization relies a normalization factor that aligns the 
third quartile gene count value for all samples. This method does not address 
potential global variations in data distributions or expression ranges. 
In contrast, quantile normalization, which is widely used to normalize gene 
expression data derived from microarrays, aligns count values according to 
the rank of the genes and forces the data of all samples into the same 
distribution. It is therefore inherent to this method that the difference 
in count value range or signal-to-noise ratio is no longer dependent on 
raw data distributions[1].

note that quantile normalization, due to its course method of normalization, 
may limit the detection of more subtle differences in gene expression.

Graph Q3 value vs negGeoMean of Negatives

```{r lpot_q3_negGeoMean}
ann_of_interest <- "ANN2"
Stat_data <- 
  data.frame(row.names = colnames(exprs(target_Data)),
             Segment = colnames(exprs(target_Data)),
             Annotation = pData(target_Data)[, ann_of_interest],
             Q3 = unlist(apply(exprs(target_Data), 2,
                               quantile, 0.75, na.rm = TRUE)),
             NegProbe = exprs(target_Data)[neg_probes, ])
Stat_data_m <- melt(Stat_data, measure.vars = c("Q3", "NegProbe"),
                    variable.name = "Statistic", value.name = "Value")

plt1 <- ggplot(Stat_data_m,
               aes(x = Value, fill = Statistic)) +
  geom_histogram(bins = 40) + theme_bw() +
  scale_x_continuous(trans = "log2") +
  facet_wrap(~Annotation, nrow = 1) + 
  scale_fill_brewer(palette = 3, type = "qual") +
  labs(x = "Counts", y = "Segments, #")

plt2 <- ggplot(Stat_data,
               aes(x = NegProbe, y = Q3, color = Annotation)) +
  geom_abline(intercept = 0, slope = 1, lty = "dashed", color = "darkgray") +
  geom_point() + guides(color = "none") + theme_bw() +
  scale_x_continuous(trans = "log2") + 
  scale_y_continuous(trans = "log2") +
  theme(aspect.ratio = 1) +
  labs(x = "Negative Probe GeoMean, Counts", y = "Q3 Value, Counts")

plt3 <- ggplot(Stat_data,
               aes(x = NegProbe, y = Q3 / NegProbe, color = Annotation)) +
  geom_hline(yintercept = 1, lty = "dashed", color = "darkgray") +
  geom_point() + theme_bw() +
  scale_x_continuous(trans = "log2") + 
  scale_y_continuous(trans = "log2") +
  theme(aspect.ratio = 1) +
  labs(x = "Negative Probe GeoMean, Counts", y = "Q3/NegProbe Value, Counts")

btm_row <- plot_grid(plt2, plt3, nrow = 1, labels = c("B", ""),
                     rel_widths = c(0.43,0.57))
plot_grid(plt1, btm_row, ncol = 1, labels = c("A", ""))

```

Q3 norm (75th percentile) for WTA/CTA with or without custom spike-ins

```{r q3_norm}
target_Data <- normalize(target_Data ,
                             norm_method = "quant", 
                             desiredQuantile = .75,
                             toElt = "q_norm")
#, data_type = "RNA" depricated after 4.1

```

Quantile Normalization

```{r quantile}
quantile <- normalize.quantiles(target_Data@assayData[["exprs"]])
rownames(quantile) <- rownames(target_Data@assayData[["exprs"]])
colnames(quantile) <- colnames(target_Data@assayData[["exprs"]])

assayDataElement(object = target_Data, elt = "quantile_norm") <-  quantile
```

Background normalization for WTA/CTA without custom spike-in

```{r bg_norm}
target_Data <- normalize(target_Data,
                             norm_method = "neg", 
                             fromElt = "exprs",
                             toElt = "neg_norm")

# , data_type = "RNA" depricated after 4.1
```

Overwrite the dafault q_norm with another normalization method if needed

```{r overwrite q_norm}
# Normalization options are: default "q_norm", "neg_norm" or "quantile_norm".
assayDataElement(object = target_Data, elt = "q_norm") <- assayDataElement(object = target_Data, elt = "q_norm") 
```

# 5.1 Visualize the first 10 segments with each normalization method {.tabset .tabset-pills}

Use the tab-menu to navigate!

```{r visulaize_norms}

#Fix zero values, which go to -inf in log transform in standard boxplot
# temp <-as.matrix(exprs((target_Data)[,1:10]))
# long <- melt(temp)
# colnames(long) <- c("gene","segment","count")
# ggplot(long, aes(x=segment,y=count)) +
#     geom_boxplot(fill="#9EDAE5") +
#     scale_y_continuous(trans=scales::pseudo_log_trans(base = 10)) +
#     scale_x_discrete(labels=c(1:10)) +
#     labs(title="Raw counts", x="segment", y = "Counts, Raw")
# 
# 
# temp <-as.matrix(assayDataElement(target_Data[,1:10], elt = "q_norm"))
# long <- melt(temp)
# colnames(long) <- c("gene","segment","count")
# ggplot(long, aes(x=segment,y=count)) +
#     geom_boxplot(fill = "#2CA02C") +
#     scale_y_continuous(trans=scales::pseudo_log_trans(base = 10)) +
#     scale_x_discrete(labels=c(1:10)) +
#     labs(title="Q3 Norm Counts", x="segment", y = "Counts, Q3 Normalized")
# 
# 
# temp <-as.matrix(assayDataElement(target_Data[,1:10], elt = "neg_norm"))
# long <- melt(temp)
# colnames(long) <- c("gene","segment","count")
# ggplot(long, aes(x=segment,y=count)) +
#     geom_boxplot(fill = "#FF7F0E") +
#     scale_y_continuous(trans=scales::pseudo_log_trans(base = 10)) +
#     scale_x_discrete(labels=c(1:10)) +
#     labs(title="Neg Norm Counts", x="segment", y = "Counts, Neg. Normalized")
```

## raw counts

```{r}
boxplot(exprs(target_Data)[,1:8],
        col = "#9EDAE5", main = "Raw Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Raw")
```

## Q3 normalized {.active}

Q3 (3rd quartile of all selected targets) is the recommended normalization method for NGS data for all targets that are above the limit of quantitation (LOQ). Q3 normalization uses the top 25% of expressers to normalize across ROIs/segments, so it is robust to changes in expression of individual genes and ideal for making comparisons across ROIs/segments.

```{r}
boxplot(assayDataElement(target_Data[,1:8], elt = "q_norm"),
        col = "#2CA02C", main = "Q3 Norm Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Q3 Normalized")
```

## Quantile normalized (testing phase)

The quantile normalization involves first ranking the gene of each sample by magnitude, calculating the average value for genes occupying the same rank, and then substituting the values of all genes occupying that particular rank with this average value. The next step is to reorder the genes of each sample in their original order[2].

```{r}
boxplot(assayDataElement(target_Data[,1:8], elt = "quantile_norm"),
        col = "pink", main = "Quantile Norm Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Quantile Normalized")
```

## Negative probe normalization

For each sample, an RNA-probe-pool-specific negative probe normalization factor was generated on the basis of the geometric mean of negative probes in each pool.

```{r}
boxplot(assayDataElement(target_Data[,1:8], elt = "neg_norm"),
        col = "#FF7F0E", main = "Neg Norm Counts",
        log = "y", names = 1:8, xlab = "Segment",
        ylab = "Counts, Neg. Normalized")

```

# 6 Unsupervised Analysis

# 6.1 UMAP {.tabset .tabset-pills}

Use the tab-menu to navigate!

## 1

```{r umap, fig.width=10,fig.height=8}
custom_umap <- umap::umap.defaults
custom_umap$random_state <- 42
# run UMAP

umap_out <-
  umap(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
       config = custom_umap)
pData(target_Data)[, c("UMAP1", "UMAP2")] <- umap_out$layout[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = slide_name, shape = ANN1)) +

  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()
```

## 2

```{r umap2, fig.width=10,fig.height=8}
ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = ANN3, shape = ANN1)) +

  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()
```

# 6.2 Run tSNE {.tabset .tabset-pills}

Use the tab-menu to navigate!

One common approach to understanding high-plex data is dimension
reduction. Two common methods are UMAP and tSNE, which are
non-orthogonally constrained projections that cluster samples based on
overall gene expression. In this study, we see by either UMAP (from the
umap package) or tSNE (from the Rtsne package)

## 1

```{r tSNE, fig.width=10,fig.height=8}
tsne_out <-
  Rtsne(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
        perplexity = ncol(target_Data)*.15)
pData(target_Data)[, c("tSNE1", "tSNE2")] <- tsne_out$Y[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = tSNE1, y = tSNE2, shape = slide_name, color = ANN2)) +
  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()
```

## 2

```{r, fig.width=10,fig.height=8}
tsne_out <-
  Rtsne(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
        perplexity = ncol(target_Data)*.15)
pData(target_Data)[, c("tSNE1", "tSNE2")] <- tsne_out$Y[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = tSNE1, y = tSNE2, color = ANN3, shape = ANN1)) +
  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()
```

If no bias is observed set batch correction needed on FALSE, otherwise TRUE.

```{r check_batch_correction}
batch_correction_needed = FALSE
```

# 6.3 Batch correction {.tabset .tabset-pills}

If the data is observed to have a batch effect it can be corrected with the methods: RUV4, LIMMA, or Combat-Seq. Each method is done and the best one is picked and used.

The standR package is short for Spatial transcriptomics analyzes and decoding in R, it aims at providing good practice pipeline and useful functions for users to analyze Nanostring’s GeoMx DSP data. In the Nanostring’s GeoMX DSP protocol, due to the fact that one slide is only big enough for a handful of tissue segments (ROIs), it is common that we see the DSP data being confounded by the batch effect introduced by different slides. In order to establish fair comparison between ROIs later on, it is necessary to remove this batch effect from the data. (https://bioconductor.org/packages/release/bioc/vignettes/standR/inst/doc/standR_introduction.html)

For RUV4 correction, the function is requiring 3 parameters other than the input object, including factors: the factor of interest, i.e. the biological variation we plan to keep; NCGs: the list of negative control genes detected using the function findNCGs; and k: is the number of unwanted factors to use, in the RUV documentation, it is suggest that we should use the smallest k once we don’t observe technical variation in the data.

Another option is set the parameter method to “Limma”, which uses the remove batch correction method from limma. In this mode, the function is requiring 2 parameters, including batch: a vector that indicating batches for all samples; and design: a design matrix which is generated by model.matrix, in the design matrix, all biologically-relevant factors should be included.

ComBat-seq is a batch effect adjustment tool for bulk RNA-seq count data. It is an improved model based on the popular ComBat, to address its limitations through novel methods designed specifically for RNA-Seq studies. ComBat-seq takes untransformed, raw count matrix as input. Same as ComBat, it requires a known batch variable. (https://github.com/zhangyuqing/ComBat-seq)

## Set up standR object

```{r batch correction, eval=batch_correction_needed}
count_geomx <- as.data.frame(target_Data@assayData[["exprs"]])

sample_geomx <- target_Data@phenoData@data
sample_geomx$roi <- target_Data@protocolData@data$roi
sample_geomx <- as.data.frame(sample_geomx)
sample_geomx$Sample_ID <- rownames(sample_geomx)
sample_geomx$SegmentDisplayName <- paste(sample_geomx$`scan name`, sample_geomx$roi, sample_geomx$segment, sep = " | ")
sample_geomx$ROICoordinateX <- 1
sample_geomx$ROICoordinateY <- 1


feature_geomx <- fData(Data)
feature_geomx <- feature_geomx[c("RTS_ID", "TargetName", "ProbeID", "Negative")]
feature_geomx <- as.data.frame(feature_geomx)
rownames(feature_geomx) <- NULL

matching <- sample_geomx$SegmentDisplayName[sample_geomx$Sample_ID %in% colnames(count_geomx)]
colnames(count_geomx) <- matching
count_geomx$TargetName <- rownames(count_geomx)
rownames(count_geomx) <- NULL

spe <- readGeoMx(count_geomx, sample_geomx, featureAnnoFile = feature_geomx, hasNegProbe = TRUE)

colData(spe)$regions <- paste0(colData(spe)$ANN2,"_",colData(spe)$ANN1) |> 
  (\(.) gsub("_Geometric Segment","",.))() |>
  paste0("_",colData(spe)$pathology) |>
  (\(.) gsub("_NA","_ns",.))()

colData(spe)$regions <- paste0(colData(spe)$ANN2,"_",colData(spe)$ANN1) |> 
  (\(.) gsub("_Geometric Segment","",.))() |>
  paste0("_",colData(spe)$pathology) |>
  (\(.) gsub("_NA","_ns",.))()
colData(spe)$biology <- paste0(colData(spe)$regions)
```

## See optimal k value for RUV4

```{r, eval=batch_correction_needed}
spe <- findNCGs(spe, batch_name = "slide_name", top_n = 500)
findBestK(spe, maxK = 10, factor_of_int = "biology", NCGs = metadata(spe)$NCGs, factor_batch = "slide_name")
```

## RUV4

```{r ruv4 batch, fig.width=10,fig.height=10, eval=batch_correction_needed}
ruv4 <- geomxBatchCorrection(spe, factors = "biology", 
                   NCGs = metadata(spe)$NCGs, k = 1)

plotPairPCA(ruv4, assay = 2, color = slide_name, shape = regions, title = "RUV4 removeBatch")

plotRLExpr(ruv4, assay = 2, color = `slide_name`) + ggtitle("RUV4 removeBatch")
```

## Limma

```{r limma batch,  fig.width=10,fig.height=10, eval=batch_correction_needed}
limma <- geomxBatchCorrection(spe,
                       batch = colData(spe)$`slide_name`, method = "Limma",
                       design = model.matrix(~ 0 + ANN1 + regions, 
                                             data = colData(spe)))

plotPairPCA(limma, assay = 2, color = slide_name, shape = regions, title = "Limma removeBatch")

plotRLExpr(limma, assay = 2, color = slide_name) + ggtitle("Limma removeBatch")
```

## CombatSeq

```{r, eval=batch_correction_needed}
adjusted <- ComBat_seq(target_Data@assayData[["exprs"]], batch=target_Data@phenoData@data[["slide_name"]], group=target_Data@phenoData@data[["ANN2"]])
assayDataElement(object = target_Data, elt = "combat") <-  adjusted

umap_out2 <-
  umap(t(log2(assayDataElement(target_Data , elt = "combat"))),
       config = custom_umap)
pData(target_Data)[, c("UMAP1", "UMAP2")] <- umap_out2$layout[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = slide_name, shape = ANN1)) +

  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw() + ggtitle("CombatSeq removeBatch")
```

## Compare limma vs RUV4

```{r compare batch, eval=batch_correction_needed}
spe_list <- list(spe, ruv4, limma)

plotClusterEvalStats(spe_list = spe_list,
                     bio_feature_name = "regions",
                     batch_feature_name = "slide_name",
                     data_names = c("Raw","RUV4","Limma"))
```

## Add batch correction result to target_Data

```{r add batch correction, eval=batch_correction_needed}
neg_probes_save <- t(as.matrix(target_Data@assayData$q_norm["NegProbe-WTX",]))
rownames(neg_probes_save) <- "NegProbe-WTX"

# Depending on correct method, change the word "limma" to "ruv4" or vice versa.
limma <-limma@assays@data@listData[["logcounts"]]
limma <- rbind(limma, neg_probes_save)
colnames(limma) <- colnames(as.data.frame(target_Data@assayData[["q_norm"]]))
assayDataElement(object = target_Data, elt = "limma") <-  limma

ruv4 <-ruv4@assays@data@listData[["logcounts"]]
ruv4 <- rbind(ruv4, neg_probes_save)
colnames(ruv4) <- colnames(as.data.frame(target_Data@assayData[["q_norm"]]))
assayDataElement(object = target_Data, elt = "ruv4") <-  ruv4
```

## Choose method {.active}

```{r}
# choose method to replace q_norm or set it to ""
method <- ""

# replace q_norm with chosen method
if (!method == "") {
  # save orginal q_norm
  assayDataElement(object = target_Data, elt = "original") <-  assayDataElement(object = target_Data, elt = "q_norm")
  
  assayDataElement(object = target_Data, elt = "q_norm") <-  assayDataElement(object = target_Data, elt = method)
  print(paste("Batch correction method:", method))
} else {print("No batch correction needed")}
```

## Umap after batch correction

```{r, fig.width=10,fig.height=8, eval=batch_correction_needed}
umap_out <-
  umap(t(log2(assayDataElement(target_Data , elt = "q_norm"))),
       config = custom_umap)
pData(target_Data)[, c("UMAP1", "UMAP2")] <- umap_out$layout[, c(1,2)]
ggplot(pData(target_Data),
       aes(x = UMAP1, y = UMAP2, color = slide_name, shape = ANN1)) +
  geom_point(size = 3) +
  #geom_text_repel(aes(label=row.names(pData(target_Data))), size=2,max.overlaps = 100)+
  theme_bw()
```

# 6.4 Clustering high CV Genes {.tabset .tabset-pills}

Another approach to explore the data is to calculate the coefficient of
variation ($CV$) for each gene ($g$) using the formula
$CV_g=SD_g/mean_g$. We then identify genes with high CVs that should
have large differences across the various profiled segments. This
unbiased approach can reveal highly variable genes across the study.

We plot the results using unsupervised hierarchical clustering,
displayed as a heatmap.

```{r clustering1}
# create a log2 transform of the data for analysis
assayDataElement(object = target_Data, elt = "log_q") <-
  assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")

# create CV function
calc_CV <- function(x) {sd(x) / mean(x)}
CV_dat <- assayDataApply(target_Data,
                         elt = "log_q", MARGIN = 1, calc_CV)
# show the highest CD genes and their CV values
sort(CV_dat, decreasing = TRUE)[1:5]
```

## Table of CV values

```{r}
# show the highest CD genes and their CV values
datatable(as.data.frame(CV_dat),
          extensions = 'Buttons', options = list (
            order = list(1, 'desc'),
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          ), caption = "CV values of genes" 
) %>% formatRound(columns=c("CV_dat"), digits=3)

```

## Heatmap genes in the top 3rd of the CV values {.active}

For heatmap clustering methods see: 
General heatmap info:
https://www.rdocumentation.org/packages/pheatmap/versions/1.0.12/topics/pheatmap
Distance methods further explained:
https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist
clustering methods further explained:
https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust

```{r CVheatmap, fig.width=20,fig.height=15}
# choose clustering methods for heatmaps
clust_method <- "average"    # options: average, ward.D, ward.D2, single, complete, mcquitty, median and centroid.
# choose clustering distance methods for heatmaps
dist_method <- "correlation" # options: correlation, euclidean, maximum, manhattan, canberra, binary and minkowski.

GOI <- names(CV_dat)[CV_dat > quantile(CV_dat, 0.75)]

pheatmap(assayDataElement(target_Data[GOI, ], elt = "log_q"),
         scale = "row",
         cutree_cols = 3,
         cutree_rows = 2,
         show_rownames = FALSE, show_colnames = TRUE,
         border_color = NA,
         drop_levels = TRUE,
         clustering_method = clust_method,
         clustering_distance_rows = dist_method,
         clustering_distance_cols = dist_method,
         breaks = seq(-3, 3, 0.05),
         color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])

```

```{r log_transform}
assayDataElement(object = target_Data, elt = "log_q") <-  assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")
log_q <-as.data.frame(assayDataElement(target_Data, elt= "log_q"))
#batch <-as.data.frame(assayDataElement(target_Data, elt= "batch"))
```

# 6.5.0 Create subset of data

If no subset is needed set subset_needed on FALSE, otherwise TRUE.

```{r define_active_aois}
# determine AOIs to use
subset_needed <- FALSE

#active_aois<-rownames(ann)[ann$patient=="p4"]
active_aois<- names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))
```

# 6.5.1 Clustering high CV genes for subset {.tabset .tabset-pills}

Calculating CV values

```{r clustering_subset, fig.width=10,fig.height=15}
# create a log2 transform of the data for analysis
assayDataElement(object = target_Data, elt = "log_q") <-
  assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")

# create CV function
calc_CV <- function(x) {sd(x) / mean(x)}
CV_dat <- assayDataApply(target_Data[,active_aois],
                         elt = "log_q", MARGIN = 1, calc_CV)

```

## Table of CV values

```{r}
# show the highest CD genes and their CV values
datatable(as.data.frame(CV_dat),
          extensions = 'Buttons', options = list (
            order = list(1, 'desc'),
            dom = 'Bftrip',
            buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
          ), caption = "CV values of genes" 
) %>% formatRound(columns=c("CV_dat"), digits=3)

```

## Heatmap on of subset, genes in the top 3rd of the CV values {.active}

```{r, fig.width=20,fig.height=15, eval=subset_needed}
# choose clustering methods for heatmaps
clust_method <- "average"    # options: average, ward.D, ward.D2, single, complete, mcquitty, median and centroid.
# choose clustering distance methods for heatmaps
dist_method <- "correlation" # options: correlation, euclidean, maximum, manhattan, canberra, binary and minkowski.

# Identify genes in the top 3rd of the CV values
GOI <- names(CV_dat)[CV_dat > quantile(CV_dat, 0.75)]
pheatmap(assayDataElement(target_Data[GOI,active_aois ], elt = "log_q"),
        scale = "row",
        fontsize_row = 5,
        cutree_cols = 3,
        cutree_rows = 2,
        show_rownames = FALSE, show_colnames = TRUE,
        border_color = NA,
        clustering_method = clust_method,
        clustering_distance_rows = dist_method,
        clustering_distance_cols = dist_method,
        breaks = seq(-3, 3, 0.05),
        color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
       annotation_colors = color_list,
        annotation_col =
          pData(target_Data)[, ann_names])
```

# 7.1 Differential Expression

![](L:/pkloosterman/Kidney_organ_atlas/kidney_structure.png)

## t-test

```{r ttest}
#gc()
plots<-list()
tables<-list()
labels<-list()
test<-"ttest"
mtc<-"BY"
#options: "holm"       "hochberg"   "hommel"     "bonferroni" "BH"         "BY"         "fdr" 
counter=1

comps_df<-data.frame(comp='',val='')

for(region in unique(pData(target_Data)$ANN3)) {
  for (active_group1 in unique(pData(target_Data)$ANN1)) {
    for (active_group2 in unique(pData(target_Data)$ANN1)) {
      
      #supress reduncant compares
      if(active_group1==active_group2) {next}
      comp<-paste(sort(c(region, active_group1,active_group2)),collapse = "_")
      if(comp %in% comps_df$comp) {next}
      temp_df<-data.frame(comp=comp ,val=1)
      comps_df<-rbind(comps_df,temp_df)
      
      labels[[counter]]<-paste(active_group1," vs ", active_group2)
      group1<-log_q[,names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))[pData(target_Data)$ANN1==active_group1 & pData(target_Data)$ANN3==region]]
      group2<-log_q[,names(as.data.frame(assayDataElement(target_Data, elt= "exprs")))[pData(target_Data)$ANN1==active_group2 & pData(target_Data)$ANN3==region]]
      
      #run t_tests  
      results<-as.data.frame ( apply(log_q, 1, function(x) t.test(x[colnames(group1)],x[colnames(group2)])$p.value) )
      colnames(results)<-"raw_p_value"
      
      #multiple_testing_correction
      adj_p_value<- p.adjust(results$raw_p_value,method=mtc)
      results<-cbind(results,adj_p_value)
      
      #calc_fdr
      FDR<- p.adjust(results$raw_p_value,method="fdr")
      results<-cbind(results,FDR)
      
      #fold_changes
      #as base data is already log transformed, means need to be subtracted to get FC in log space
      fchanges<-as.data.frame(apply(log_q, 1, function(x) (mean(x[colnames(group1)]) - mean(x[colnames(group2)]))))
      colnames(fchanges)<-"FC"
      #paste("FC",active_group1," / ",active_group2)
      results<-cbind(results,fchanges)
      
      #add genenames
      results$Gene<-rownames(results)
      
      #set categories based on P-value & FDR for plotting
      results$Color <- "NS or FC < 1"
      results$Color[results$raw_p_value < 0.05] <- "P < 0.05"
      results$Color[results$FDR < 0.05] <- "FDR < 0.05"
      results$Color[results$FDR < 0.001] <- "FDR < 0.001"
      results$Color[abs(results$FC) <1] <- "NS or FC < 1"
      results$Color <- factor(results$Color,
                              levels = c("NS or FC < 1", "P < 0.05", "FDR < 0.05", "FDR < 0.001"))
      
      #vulcanoplot
      
      # pick top genes for either side of volcano to label
      # order genes for convenience:
      
      results$invert_P <- (-log10(results$adj_p_value)) * sign(results$FC)
      top_g <- c()
      top_g <- c(top_g,
                 results[, 'Gene'][
                   order(results[, 'invert_P'], decreasing = TRUE)[1:20]],
                 results[, 'Gene'][order(results[, 'invert_P'], decreasing = FALSE)[1:20]])
      top_g<- unique(top_g)
      results <- results[, -1*ncol(results)] # remove invert_P from matrix
      
      # Graph results
      
      plots[[counter]]<- ggplot(results,
                                      aes(x = FC, y = -log10(raw_p_value),
                                          color = Color, label = Gene)) +
        geom_vline(xintercept = c(1, -1), lty = "dashed") +
        geom_hline(yintercept = -log10(0.05), lty = "dashed") +
        geom_point() +
        labs(x = paste("Enriched in", active_group2," <- log2(FC) -> Enriched in", active_group1),
             y = "Significance, -log10(P)",
             color = "Significance") +
        scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                      `FDR < 0.05` = "lightblue",
                                      `P < 0.05` = "orange2",
                                      `NS or FC < 0.5` = "gray"),
                           guide = guide_legend(override.aes = list(size = 4))) +
        scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
        geom_text_repel(data = subset(results, (-1>FC| FC>1) & FDR < 0.05 & Gene %in% top_g),
                        point.padding = 0.15, color = "black", size=3.5,
                        min.segment.length = .1, box.padding = .2, lwd = 2,
                        max.overlaps = 50) +
        theme_bw(base_size = 10) +
        theme(legend.position = "bottom") +
        ggtitle(paste("class: ", region," - ", test, mtc,"multitest corr"))
      
      #store tables for display later
      tables[[counter]]<-results
      
      counter = counter+1
      #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
    }
  }
}
grid.arrange(grobs=plots,ncol=2)
```

```{r dump_tables,results='asis'}
#strangly does not appear in html output -> chucks are limited to one datatable per chuck. The htmltools are a way around this but does show them stacked in a box.
results_tables <- htmltools::tagList()
for (c in (2:counter-1)) {
  #Gene %in% GOI
  results_tables[[c]] <- datatable(subset(tables[[c]], Color == c("FDR < 0.001",  "P < 0.05")), 
           rownames=FALSE,
           extensions = c('Buttons'), options = list (
              dom = 'Bftrip',
              buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
            ), height = 550,
           caption = paste("DE results ", labels[[c]]),filter='top') %>% formatRound(columns=c("raw_p_value","adj_p_value","FDR","FC"), digits=3)
  #cat('\n\n<!-- -->\n\n')
}            
results_tables
```

```{r}
print("") # for layout of next section
```

# 7.2 DE analysis with LMM

A common statistical approach is to use a linear mixed-effect model
(LMM). The LMM allows the user to account for the subsampling per
tissue; in other words, we adjust for the fact that the multiple regions
of interest placed per tissue section are not independent observations,
as is the assumption with other traditional statistical tests. The
formulation of the LMM model depends on the scientific question being
asked.

Overall, there are two flavors of the LMM model when used with GeoMx
data: i) with and ii) without random slope.

When comparing features that co-exist in a given tissue section, a
random slope is included in the LMM model. When comparing features that
are mutually exclusive in a given tissue section the LMM model does not
require a random slope.

Mostly, we use patient/sample as Random Intercept when they are combined
on slides.

![](http://useq.nl/wp-content/uploads/2022/12/LMM_setup.png)

```{r LMM region}
# convert test variables to factors
pData(target_Data)$testClass <- factor(pData(target_Data)$ANN1, unique(target_Data$ANN1))
pData(target_Data)[["slide"]] <- factor(pData(target_Data)[["slide_name"]])
#assayDataElement(object = target_Data, elt = "log_q") <- assayDataApply(target_Data, 2, FUN = log, base = 2, elt = "q_norm")

# run LMM:
# formula follows conventions defined by the lme4 package
lmm_results <- c()
for (region in unique(pData(target_Data)$ANN3)) {
    ind <- pData(target_Data)$ANN3 == region

    mixedOutmc <-
        mixedModelDE(target_Data[,ind],
                     elt = "log_q",
                     modelFormula = ~ testClass + (1 | slide),
                     groupVar = "testClass",
                     nCores = parallel::detectCores(),
                     multiCore = TRUE)

    # format results as data.frame
    r_test <- do.call(rbind, mixedOutmc["lsmeans", ])
    tests <- rownames(r_test)
    r_test <- as.data.frame(r_test)
    r_test$Contrast <- tests

    # use lapply in case you have multiple levels of your test factor to
    # correctly associate gene name with it's row in the results table
    r_test$Gene <-
        unlist(lapply(colnames(mixedOutmc),
                      rep, nrow(mixedOutmc["lsmeans", ][[1]])))
    r_test$Subset <- region
    r_test$FDR <- p.adjust(r_test$`Pr(>|t|)`, method = "fdr")
    r_test <- r_test[, c("Gene", "Subset", "Contrast", "Estimate",
                         "Pr(>|t|)", "FDR")]
    
    lmm_results <- rbind(lmm_results, r_test)
}
```

# 7.3 Vulcanoplot + table of LMM

```{r lmm_vulcano, fig.width=20,fig.height=10}
# Categorize Results based on P-value & FDR for plotting
fc_threshold = 1

lmm_results$Color <- paste("NS or FC < ",fc_threshold = 1)
lmm_results$Color[lmm_results$`Pr(>|t|)` < 0.05] <- "P < 0.05"
lmm_results$Color[lmm_results$FDR < 0.05] <- "FDR < 0.05"
lmm_results$Color[lmm_results$FDR < 0.001] <- "FDR < 0.001"
lmm_results$Color[abs(lmm_results$Estimate) < fc_threshold] <- paste("NS or FC < ",fc_threshold = 1)
lmm_results$Color <- factor(lmm_results$Color,
                        levels = c("NS or FC < 1", "P < 0.05",
                                   "FDR < 0.05", "FDR < 0.001"))
counter = 1
plots_lmm <- list()
for(c in unique(lmm_results$Subset) ) {
  lmm_results_slice = lmm_results[lmm_results$Subset == c,]

  # pick top genes for either side of volcano to label
  # order genes for convenience:
  lmm_results_slice$invert_P <- (-log10(lmm_results_slice$`Pr(>|t|)`)) * sign(lmm_results_slice$Estimate)
  
  #loop here over tested conditions if applicable
  top_g <- c()
  top_g <- c(top_g,
    lmm_results_slice[, 'Gene'][
      order(lmm_results_slice[, 'invert_P'], decreasing = TRUE)[1:15]],
    lmm_results_slice[, 'Gene'][
      order(lmm_results_slice[, 'invert_P'], decreasing = FALSE)[1:15]])

  top_g <- unique(top_g)
  
  lmm_results_slice <- lmm_results_slice[, -1*ncol(lmm_results_slice)] # remove invert_P from matrix
  
  # Flip Contrast
  contrast_lab <- as.character(lmm_results_slice$Contrast)
  contrast_lab <- strsplit(contrast_lab, "-")[[1]]
  contrast_lab <- paste(contrast_lab[2], "-", contrast_lab[1])
  
  # Graph results
  plots_lmm[[counter]] <- ggplot(lmm_results_slice,
               aes(x = Estimate, y = -log10(`Pr(>|t|)`),
                   color = Color, label = Gene)) +
          geom_vline(xintercept = c(fc_threshold, -fc_threshold), lty = "dashed") +
          geom_hline(yintercept = -log10(0.05), lty = "dashed") +
          geom_point() +
          labs(
            x = paste(contrast_lab, " log2(FC)"),
               y = "Significance, -log10(P)",
               color = "Significance") +
          scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                        `FDR < 0.05` = "lightblue",
                                        `P < 0.05` = "orange2",
                                        `NS or FC < 1` = "gray"),
                             guide = guide_legend(override.aes = list(size = 4))) +
          scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
          geom_text_repel(data = subset(lmm_results_slice,  `Pr(>|t|)` < 0.001 & abs(lmm_results_slice$Estimate) > fc_threshold & Gene %in% top_g),
                          point.padding = 0.15, color = "black",size=5,
                          min.segment.length = .1, box.padding = .2, lwd = 2,
                          max.overlaps = 50) +
          theme_bw(base_size = 16) +
          theme(legend.position = "bottom") +
          facet_wrap(~Subset, scales = "free_y")
  counter <- counter + 1
}
grid.arrange(grobs=plots_lmm,ncol=2)
```    

```{r table_of_LMM_results region}
#subset(lmm_results, Gene %in% GOI)
datatable(lmm_results, rownames = FALSE,
          extensions = 'Buttons', options = list (
              dom = 'Bftrip',
              buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
            ),
          caption = "DE results for Genes of Interest (>75% CV)",filter='top') %>% formatRound(columns=c("Estimate","Pr(>|t|)","FDR"), digits=3)
```

# 7.4 Plotting Genes of Interest

```{r plot_gene_of_interest, fig.width=10,fig.height=5}
my_gois <-unique(subset(lmm_results, `FDR` < 0.001)$Gene)
tmp_tbl<-subset(lmm_results, Gene %in% my_gois)

my_gois <- c()
for (contrast in unique(tmp_tbl$Contrast)) { # gene of interest for all contrasts
#for (contrast in c("Jux_Glo - Pro_Tub")) {
  ind <- tmp_tbl$Contrast == contrast
  goi <- tmp_tbl[ind, "Gene"][order(tmp_tbl[ind, "Estimate"], decreasing = FALSE)[1:3]]
  my_gois <- c(my_gois, goi)
  goi <- tmp_tbl[ind, "Gene"][order(tmp_tbl[ind, "Estimate"], decreasing = TRUE)[1:3]]
  my_gois <- c(my_gois, goi)
}

if(nrow(tmp_tbl) > 1) { 
  datatable(tmp_tbl,rownames = FALSE,caption = "DE results for Genes of Interest ",filter='top') %>% formatRound(columns=c("Estimate","Pr(>|t|)","FDR"), digits=3)
 
for (my_goi in my_gois) {
# show expression for a single target
  a<-ggplot(pData(target_Data),
       aes(x = ANN2, fill = ANN2,
           y = as.numeric(assayDataElement(target_Data[my_goi, ], elt = "q_norm")))) +
  geom_violin() +
  geom_jitter(width = .2) +
  labs(y = paste(my_goi,"Expression")) +
  scale_y_continuous(trans = "log2") +
  #facet_wrap(~ANN3, nrow=1) + theme_bw(base_size = 16) +
  theme_bw() + coord_flip()
  print(a)
}
}else{
  print("No significant lMM results to plot")
}
```

# 7.5 Heatmap of Significant Genes

In addition to generating individual gene box plots or volcano plots, we
can again create a heatmap from our data. This time rather than
utilizing CV to select genes, we can use the P-value or FDR values to
select genes. Here, we plot all genes with an FDR \< 0.001.

```{r heatmap_sig_genes, fig.width=20,fig.height=20}
my_gois <-unique(subset(lmm_results, `FDR` < 0.001)$Gene)   # 100 to prevent long runtime

if(length(my_gois)<2) {
  print("No significant results to show")
 
}else{

pheatmap(log2(assayDataElement(target_Data[my_gois, ], elt = "q_norm")),
         scale = "row",
         show_rownames = TRUE, show_colnames = TRUE,
         border_color = NA,
         clustering_method = "average",
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         cutree_cols = 3, cutree_rows = 2,
         breaks = seq(-3, 3, 0.05),
         color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])
}
```

# 8 Pathway Analysis

Pathway analysis enables exploration of different aggregate gene sets
for our experimental questions. Each individual ROI/AOI segment is
scored for every pathway of interest, which we can then use to
investigate biological differences. We will perform analogous analyses
as those outlined in the Differential Expression and Spatial
Deconvolution sections of the report for gene set enrichment.

# 8.1 Scoring Gene Sets

Pathways and gene sets were defined from the Kegg Brite database. We use
an R software package called Gene Set Variation Analysis to score each
segment within our study. see
<https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html>
for options on collections. We use the KEGG and REACTOME.

```{r build_genesets}
#gc()
h_gene_sets = rbind(msigdbr(species = "human", subcategory = "CP:KEGG"),
                    msigdbr(species = "human", subcategory = "CP:REACTOME"))
#msigdbr(species = "human", subcategory = "CP:BIOCARTA")

msigdbr_list = split(x = h_gene_sets$gene_symbol, f = h_gene_sets$gs_name)

# prepare df for accurate merging back genes later
pw_gene_df<-data.frame(Pathway = names(msigdbr_list))
pw_gene_df$genes<-msigdbr_list
```

```{r run_gsva}
ssgsea_results <- GSVA::gsva(expr = assayDataElement(target_Data,
                            elt = "log_q"),
                            gset.idx.list = msigdbr_list,
                            method = "ssgsea",
                            min.sz = 5,
                            max.sz = 500,
                            verbose = FALSE)
geneSetObj <-
  NanoStringGeoMxSet(assayData = ssgsea_results,
                     phenoData = AnnotatedDataFrame(pData(target_Data)),
                     protocolData = protocolData(target_Data),
                     featureType = "GeneSet",
                     check = FALSE)
```

# 8.2 Differental analysis of pathways

```{r LMM_of_pathway_analisis region}
# # convert test variables to factors
pData(target_Data)$testClass <- factor(pData(target_Data)$ANN1, unique(target_Data$ANN1))
pData(geneSetObj)[["slide"]]<-factor(pData(geneSetObj)[["slide_name"]])
#pData(target_Data)$testRegion<-factor(pData(target_Data)$ANN2, unique(count_mat$ANN3))

lmm_ssgsea_results <- c()
for (region in unique(pData(target_Data)$ANN3)) {
  ind <- pData(target_Data)$ANN3 == region
  mixedOutmc <-
    mixedModelDE(geneSetObj[, ind],
                 elt = "exprs",
                 modelFormula = ~ testClass + (1 | slide),
                 groupVar = "testClass",
                 nCores = parallel::detectCores(),
                 multiCore = TRUE)

  # format results as data.frame
  r_ssgsea_test <- do.call(rbind, mixedOutmc["lsmeans", ])
  ssgsea_tests <- rownames(r_ssgsea_test)
  r_ssgsea_test <- as.data.frame(r_ssgsea_test)
  r_ssgsea_test$Contrast <- ssgsea_tests
  #r_ssgsea_test$Genes <- msigdbr_list seems unreliable as gsva omits pathways if genes are not in data..

  # use lapply in case you have multiple levels of your test factor to
  # correctly associate gene name with it's row in the results table
  r_ssgsea_test$Pathway <-
    unlist(lapply(colnames(mixedOutmc),
                  rep, nrow(mixedOutmc["lsmeans", ][[1]])))

  #r_ssgsea_test$Subset <- status
  r_ssgsea_test$FDR <- p.adjust(r_ssgsea_test$`Pr(>|t|)`, method = "fdr")
  r_ssgsea_test$Subset <- region
  r_ssgsea_test <- r_ssgsea_test[, c("Pathway", "Subset", "Contrast", "Estimate",
                                     "Pr(>|t|)", "FDR")]
  lmm_ssgsea_results <- rbind(lmm_ssgsea_results, r_ssgsea_test)

  }
  lmm_ssgsea_results <- merge(lmm_ssgsea_results, pw_gene_df)
```

# 8.2.1 Table of Differental analysis of pathways

```{r table_of_LMM_pathway_results region, fig.width=20,fig.height=20}
datatable(subset(lmm_ssgsea_results), rownames = FALSE,
          extensions = 'Buttons', options = list (
             pageLength = 10,
              dom = 'Bftrip',
              buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
            ),
          caption = "DE results for Pathways",filter='top') %>% formatRound(columns=c("Estimate","Pr(>|t|)","FDR"), digits=5)
```

# 8.3 Vulcanoplot of LMM_Pathways

```{r lmm_pw_vulcano, fig.width=20,fig.height=10}
# Categorize Results based on P-value & FDR for plotting
fc_threshold = 0.3

lmm_ssgsea_results$Color <- "NS or FC < 0.3"
lmm_ssgsea_results$Color[lmm_ssgsea_results$`Pr(>|t|)` < 0.05] <- "P < 0.05"
lmm_ssgsea_results$Color[lmm_ssgsea_results$FDR < 0.05] <- "FDR < 0.05"
lmm_ssgsea_results$Color[lmm_ssgsea_results$FDR < 0.001] <- "FDR < 0.001"
lmm_ssgsea_results$Color[abs(lmm_ssgsea_results$Estimate) < fc_threshold] <- "NS or FC < 0.3"
lmm_ssgsea_results$Color <- factor(lmm_ssgsea_results$Color,
                        levels = c("NS or FC < 0.3", "P < 0.05",
                                   "FDR < 0.05", "FDR < 0.001"))

# pick top pw for either side of volcano to label
# order pw for convenience:
lmm_ssgsea_results$invert_P <- (-log10(lmm_ssgsea_results$`Pr(>|t|)`)) * sign(lmm_ssgsea_results$Estimate)
top_ssgsea_g <- c()

#loop here over tested conditions if applicable

for(c in unique(lmm_ssgsea_results$Subset) ) {
  lmm_results_slice = lmm_ssgsea_results[lmm_ssgsea_results$Subset == c,]
 top_ssgsea_g <- c(top_ssgsea_g,
         lmm_ssgsea_results[ind, 'Pathway'][
               order(lmm_ssgsea_results[ind, 'invert_P'], decreasing = TRUE)[1:15]],
         lmm_ssgsea_results[ind, 'Pathway'][
               order(lmm_ssgsea_results[ind, 'invert_P'], decreasing = FALSE)[1:15]])
}
 
top_ssgsea_g <- unique(top_ssgsea_g)
lmm_ssgsea_results <- lmm_ssgsea_results[, -1*ncol(lmm_ssgsea_results)] # remove invert_P from matrix

#lmm_ssgsea_results_slice <- lmm_ssgsea_results_slice[lmm_ssgsea_results_slice$FDR < 1,]

# Graph results
ggplot(lmm_ssgsea_results,
       aes(x = Estimate, y = -log10(`Pr(>|t|)`),
           color = Color, label = Pathway)) +
    geom_vline(xintercept = c(0.2, -0.2), lty = "dashed") +
    geom_hline(yintercept = -log10(0.05), lty = "dashed") +
    geom_point() +
    labs(x = paste(lmm_results_slice$Contrast, "log2(FC)"),
         y = "Significance, -log10(P)",
         color = "Significance") +
    scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                  `FDR < 0.05` = "lightblue",
                                  `P < 0.05` = "orange2",
                                  `NS or FC < 0.5` = "gray"),
                       guide = guide_legend(override.aes = list(size = 4))) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
    geom_text_repel(data = subset(lmm_ssgsea_results, Color == "FDR < 0.05" | Color == "FDR < 0.001"),
                   point.padding = 0.15, color = "black",size=3,
                   min.segment.length = .1, box.padding = .2, lwd = 2,
                   max.overlaps = 50) +
    theme_bw(base_size = 16) +
    theme(legend.position = "bottom") +
    facet_wrap(~Subset, scales = "free_y")

#    +facet_wrap(~Subset, scales = "free_y"))
```

# 8.4 heatmap of pathways

```{r pw_heatmap, fig.width=20,fig.height=20}
  active_pw<-filter(lmm_ssgsea_results, `Pr(>|t|)` < 0.001)$Pathway
  active_pw<-filter(lmm_ssgsea_results, FDR < 0.001 )$Pathway
  #active_pw<-filter(lmm_ssgsea_results, Color == "FDR < 0.001")$Pathway
  
  
  active_pw<-top_ssgsea_g
  
  if (length(active_pw)>1) {
  
    print("go")
    
  pw_matrix<-assayDataElement(geneSetObj, elt = "exprs")

  pheatmap(pw_matrix[active_pw,],
         scale = "row",
         show_rownames = TRUE, show_colnames = TRUE,
         fontsize_row = 10,
         border_color = NA,
         clustering_method = "average",
         #clustering_distance_rows = "correlation",
         clustering_distance_cols = "euclidean",
         cutree_cols = 2, cutree_rows = 2,
         breaks = seq(-3, 3, 0.05),
         #color = colorRampPalette(c("purple3", "black", "yellow2"))(120),
         main = "Heatmap of selected Pathways",
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])
  
  }else{
    print("No significant results to display")
  }
```

# 8.5 fgsea pathway analysis

Another option for pathway analysis is the R software package called Fast Gene Set Enrichment Analysis. It is a pre-ranked method that uses the LMM results combined with the same pathway list from msigdbr as GSVA.

```{r fgsea}
fgsea_results_all <- c()
for (contrast in unique(lmm_results$Contrast)) {
    
    ranks <- lmm_results$Estimate[lmm_results$Contrast == contrast]
    names(ranks) <- lmm_results$Gene[lmm_results$Contrast == contrast]
    
    fgsea_results <- fgsea(msigdbr_list, 
                         ranks, 
                         eps = 0.0,
                         minSize=15, 
                         maxSize = 500)
    fgsea_results$Contrast <- contrast
    fgsea_results_all <- rbind(fgsea_results_all, fgsea_results)
    
}

fgsea_reactome <- fgsea_results_all[grep("REACTOME", pathway),]
fgsea_reactome$pathway <- gsub("REACTOME_", "", fgsea_reactome$pathway)
fgsea_reactome$pathway <- gsub("_", " ", fgsea_reactome$pathway)
```

```{r fgsea_pw_vulcano, fig.width=20,fig.height=10}
# Categorize Results based on P-value & FDR for plotting
fc_threshold = 0.3


fgsea_results_all$Color <- "NS or FC < 0.3"
fgsea_results_all$Color[fgsea_results_all$pval < 0.05] <- "P < 0.05"
fgsea_results_all$Color[fgsea_results_all$padj < 0.05] <- "FDR < 0.05"
fgsea_results_all$Color[fgsea_results_all$padj < 0.001] <- "FDR < 0.001"
fgsea_results_all$Color[abs(fgsea_results_all$ES) < fc_threshold] <- "NS or FC < 0.3"
fgsea_results_all$Color <- factor(fgsea_results_all$Color,
                        levels = c("NS or FC < 0.3", "P < 0.05",
                                   "FDR < 0.05", "FDR < 0.001"))

plt <- htmltools::tagList()
counter = 1
for(c in unique(fgsea_results_all$Contrast) ) {
  fgsea_results_slice = fgsea_results_all[fgsea_results_all$Contrast == c,]

  # # pick top genes for either side of volcano to label
  # # order genes for convenience:
  fgsea_results_slice$invert_P <- (-log10(fgsea_results_slice$pval)) * sign(fgsea_results_slice$ES)
  
  # #loop here over tested conditions if applicable
  #top_fgsea_g <- c()
  top_fgsea_g <- c(fgsea_results_slice[, 'pathway'][
                        order(fgsea_results_slice[, 'invert_P'], decreasing = TRUE)[1:15]],
                    fgsea_results_slice[, 'pathway'][
                        order(fgsea_results_slice[, 'invert_P'], decreasing = FALSE)[1:15]])
   
  # for (celltype in unique(lmm_results$Subset)) {
  #      top_fgsea_g <- c(top_fgsea_g,
  #                  fgsea_results_slice[fgsea_results_slice$Subset==celltype, 'pathway'][
  #                      order(fgsea_results_slice[, 'invert_P'], decreasing = TRUE)[1:20]],
  #                  fgsea_results_slice[fgsea_results_slice$Subset==celltype, 'pathway'][
  #                      order(fgsea_results_slice[, 'invert_P'], decreasing = FALSE)[1:20]])
  # }
  
  top_fgsea_g <- unique(top_fgsea_g)
  #fgsea_results_slice <- fgsea_results_slice[, -1*ncol(fgsea_results_slice)] # remove invert_P from matrix
  
  # Graph results
  dynplot <- ggplot(fgsea_results_slice,
         aes(x = ES, y = -log10(pval),
             color = Color, label = pathway)) +
      geom_vline(xintercept = c(fc_threshold,-fc_threshold), lty = "dashed") +
      geom_hline(yintercept = -log10(0.05), lty = "dashed") +
      geom_point() +
      labs(x = paste(c, " FC"),
           y = "Significance, -log10(P)",
           color = "Significance") +
      scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                    `FDR < 0.05` = "lightblue",
                                    `P < 0.05` = "orange2",
                                    `NS or FC < 0.3` = "gray"),
                         guide = guide_legend(override.aes = list(size = 4))) +
      scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
      geom_text_repel(data = subset(fgsea_results_slice, Color == "P < 0.05" | Color == "FDR < 0.05" | Color == "FDR < 0.001"),
                     point.padding = 0.15, color = "black",size=5,
                     min.segment.length = .1, box.padding = .2, lwd = 2,
                     max.overlaps = 50) +
      theme_bw(base_size = 16) +
      theme(legend.position = "bottom") #+
      #facet_wrap(~Subset, scales = "free_y")
   
  plt[[counter]] <- as_widget(ggplotly(dynplot))
  counter <- counter + 1
  #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
}
plt
```

## Barplot pathways

Clear vizualization of the top 15 positive and negative expressed Reactome pathways. Pathways with a adjusted p-value of 0.05 or higher will be presented as red 
and below 0.05 as green.

Fgsea returns results from the given databases. To avoid one database overshadowing the other, results have been split to show database specific results. 
Reactome is shown in barplots and KEGG as a pathway image including highlighted genes. 

```{r horizontal pathway bar plot, fig.width=20,fig.height=20}
top_all <- c()
for (contrast in unique(fgsea_reactome$Contrast)) {
    # active_group1 <- contrast_list[contrast][[1]][[1]]
    # active_group2 <- contrast_list[contrast][[1]][[2]]
    # ind <- pData(target_Data)[pData(target_Data)$ANN2 == active_group1 | pData(target_Data)$ANN2 == active_group2]
    top <- fgsea_reactome[fgsea_reactome$Contrast == contrast]
    toppos <- top[order(top$NES, decreasing = TRUE)][0:15]
    topneg <- top[order(top$NES, decreasing = FALSE)][0:15]
    top <- rbind(toppos, topneg)
    
    top$Color[top$padj < 0.05] <- "padj < 0.05"
    top$Color[top$padj == 0.05 | top$padj > 0.05] <- "padj > 0.05"
    top_all <- rbind(top_all, top)
    barplot <- ggplot(top,aes(x=reorder(pathway, NES),y=NES,fill=Color)) + 
      scale_x_discrete(labels = function(x) str_wrap(x, width = 100)) + 
      geom_col(position="dodge") + scale_fill_manual(values = c("#2ECC71", "#E74C3C")) +
      ggtitle(paste("Top 15 + and - NES reactome pathways from", contrast)) + 
      xlab("pathway") + ylab("Normalized Enrichment Score") +
      coord_flip() + 
      theme(text = element_text(size = 20)) +
      theme(panel.background = element_blank())
      #facet_wrap(~Contrast, scales = "free_y")
    print(barplot)
}

```

# 8.6 Visualize KEGG pathway

Get pathway visualization from KEGG. clusterProfiler grabs the KEGG pathway IDs and combined with pathview it can download a pathway image from the KEGG database.
The LMM results are given with the pathway id to visualize up and down regulated genes. It will pick the most significant pathway if not manually specified.

```{r}
get_wp_gmtfile <- function() {
    wpurl <- 'https://wikipathways-data.wmcloud.org/current/gmt/'
    x <- readLines(wpurl)
    y <- x[grep('\\.gmt',x)]
    sub(".*(wikipathways-.*\\.gmt).*", "\\1",  y[grep('File', y)])
}

get_wp_data <- function(organism) {
    organism <- sub(" ", "_", organism)
    gmtfile <- get_wp_gmtfile()
    wpurl <- 'https://wikipathways-data.wmcloud.org/current/gmt/'
    url <- paste0(wpurl,
                  gmtfile[grep(organism, gmtfile)])
    f <- tempfile(fileext = ".gmt")
    dl <- mydownload(url, destfile = f)
    if (is.null(f)) {
        message("fail to download wikiPathways data...")
        return(NULL)
    }
    read.gmt.wp(f)
}
```


```{r get online available pathways, fig.width=20,fig.height=10}
# enrichKEGG grab online pathways
gene <-  target_Data@featureData@data[["GeneID"]]
kk <- enrichKEGG(gene         = gene,
                 organism     = 'hsa',
                 pvalueCutoff = 0.1)

enrichKEGG_results <- kk@result

# Match column values for link
# fsgea run
fgsea_results$Description <- gsub("KEGG_", "", fgsea_results$pathway)
fgsea_results$Description <- gsub("_", " ", fgsea_results$Description)
# ssgea run
# lmm_ssgsea_results_d$Description <- gsub("KEGG_", "", lmm_ssgsea_results_d$Pathway)
# lmm_ssgsea_results_d$Description <- gsub("_", " ", lmm_ssgsea_results_d$Description)

enrichKEGG_results$Description <- toupper(enrichKEGG_results$Description)

# Create df for matching KEGG results
KEGGIDs <- fgsea_results#[fgsea_results$Contrast == "Pro_Tub - Dis_Tub"]
KEGGIDs <- KEGGIDs %>% inner_join(enrichKEGG_results, by = 'Description') %>% select(Description, ID, geneID, leadingEdge, padj)

# Create geneList with DE results for visualization pathway genes 
genelist <- lmm_results$Estimate
names(genelist) <- target_Data@featureData@data[["GeneID"]]

# Choose manual or highest significant pathway
pathway_name <- "INSULIN SIGNALING PATHWAY" # manual pathway query
pathway_name <- toupper(pathway_name)
if (pathway_name == "") {
  pathway_name <- KEGGIDs[order(KEGGIDs$padj, decreasing = FALSE), ][1]$Description
}
pathwayid <- KEGGIDs$ID[KEGGIDs$Description == pathway_name]

# Grab pathway image with gene info, save and plot
viewPath <- pathview(gene.data  = genelist,
                     pathway.id = pathwayid,
                     species    = "hsa",
                     out.suffix = "genesinfo", kegg.native = T, same.layer = F)
img <- readPNG(paste(pathwayid, ".genesinfo.png", sep = ""))
grid::grid.raster(img)
```

# 9 Spatial Deconvolution

## 9.1 Calculate backgrounds

```{r spatial_decon_bg}
#gc()
bg = derive_GeoMx_background(
  norm = assayDataElement(target_Data , elt = "q_norm"),
  probepool = fData(target_Data)$Module,
  negnames = c("NegProbe-CTP01","NegProbe-Kilo","Negative Probe", "NegProbe-WTX" ))
  #negnames = "NegProbe-WTX")

```

## 9.2 Load cell profile

A "cell profile matrix" is a pre-defined matrix that specifies the
expected expression profiles of each cell type in the experiment. The
SpatialDecon library comes with one such matrix pre-loaded, the
"SafeTME" matrix, designed for estimation of immune and stroma cells in
the tumor microenvironment. (This matrix was designed to avoid genes
commonly expressed by cancer cells; see the SpatialDecon manuscript for
details.). Otherwise, load specific profiles from
<https://github.com/Nanostring-Biostats/CellProfileLibrary/tree/NewProfileMatrices>

```{r load_cell_profiles}
#safeTME
data("safeTME")
data("safeTME.matches")
current_cell_profile<-safeTME

#see: https://github.com/Nanostring-Biostats/CellProfileLibrary/tree/NewProfileMatrices

current_cell_profile <- download_profile_matrix(species = "Human",
                                               age_group = "Adult",
                                               matrixname = "Kidney_HCA")

heatmap(sweep(current_cell_profile, 1, apply(current_cell_profile, 1, max), "/"),
        labRow = NA, margins = c(10, 5), cexCol = 0.7)
```

# 9.3 Run spatial deconvolution

```{r spatial_decon_run}
# vector identifying pure tumor segments:
#target_Data$istumor = target_Data$ANN3 == "CORE" & target_Data$ANN1 == "PanCK+"
res = runspatialdecon(object = target_Data,
                      norm_elt = "q_norm",
                      raw_elt = "exprs",
                      #is_pure_tumor = target_Data$istumor,
                      cell_counts = target_Data$nuclei,
                      X = current_cell_profile,
                      #cellmerges = safeTME.matches,              # safeTME.matches object, used by default
                      #n_tumor_clusters = 5,                      # how many distinct tumor profiles to append to safeTME
                      align_genes = TRUE)


```

# 9.3.1 Spatial deconvolution heatmaps {.tabset .tabset-pills}

## Abundance

```{r spatial_decon_heatmap, fig.width=25,fig.height=15}
# NOTE: check clustering.. why different?

#set display thresholds
thresh <- signif(quantile(res$beta, 0.97), 2)

# plot stored to keep clustering for later
p1<-pheatmap(pmin(t(res$beta),thresh),
         #scale = "row", 
         cutree_cols = 3,
         cutree_rows = 2,
         fontsize_row = 12,
         show_rownames = TRUE, show_colnames = TRUE,
         angle_col = "90",
         border_color = NA,
         #clustering_method = "average",
         #clustering_distance_rows = "correlation",
         #clustering_distance_cols = "correlation",
         legend_breaks = c(round(seq(0, thresh, length.out = 5))[-5], thresh),
         legend_labels = c(round(seq(0, thresh, length.out = 5))[-5], paste0("Abundance scores,\ntruncated above at ", thresh)),
         #breaks = seq(0, 5, 1),
         color = colorRampPalette(c("white","darkblue"))(100),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names]
         )
#p1
```

## Proportional

```{r spatial_decon_propheatmap, fig.width=25,fig.height=15}
# proportions:
props <- replace(res$prop_of_nontumor, is.na(res$prop_of_nontumor), 0)

p2<-pheatmap(t(props),
         #scale = "row", 
         cutree_cols = 3,
         cutree_rows = 2,
         fontsize_row = 12,
         show_rownames = TRUE, show_colnames = TRUE,
         angle_col = "90",
         border_color = NA,
         #clustering_method = "average",
         #clustering_distance_rows = "correlation",
         #clustering_distance_cols = "correlation",
         legend_breaks = round(seq(0, max(props) * 0.99, length.out = 5), 2),
         legend_labels = c(round(seq(0, max(props), length.out = 5), 2)[-5], "Proportion of all\nfitted populations"),
         color = colorRampPalette(c("white","darkblue"))(100),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])

#p2

```

## Scaled

```{r spatial_decon_scaledheatmap, fig.width=25,fig.height=15}
# scaled abundances:
epsilon <- min(res$beta[res$beta > 0])
mat <- sweep(res$beta, 1, pmax(apply(res$beta, 1, max), epsilon), "/")

pheatmap(t(mat),
         #scale = "row",
         cutree_cols = 3,
         cutree_rows = 3,
         fontsize_row = 12,
         show_rownames = TRUE, show_colnames = TRUE,
         angle_col = "90",
         border_color = NA,
         #clustering_method = "average",
         #clustering_distance_rows = "correlation",
         #clustering_distance_cols = "correlation",
         legend_breaks = c(round(seq(0, 1, length.out = 5), 2)[-5], 1),
         legend_labels = c(round(seq(0, 1, length.out = 5), 2)[-5], "Scaled abundance\n(ratio to max)"),
         color = colorRampPalette(c("white","darkblue"))(100),
         annotation_colors = color_list,
         annotation_col = pData(target_Data)[, ann_names])

```

# 9.4 Barplots {.tabset .tabset-pills}

## abundance

```{r SD_abundance_barplot, fig.width=25,fig.height=15}
# define variables to show in heatmaps:
pData(target_Data)$region <- 
    factor(pData(target_Data)$ANN3, unique(target_Data$ANN3))   # Must be manual?
pData(target_Data)$class <- 
    factor(pData(target_Data)$ANN1, unique(target_Data$ANN1))   # Must be manual?

variables_to_plot <- c("slide_name", "ANN1", "ANN3")                     # Must be manual?

#define colors

# only for safeTME colors
#col <- cellcols

#custom celmatrix
#get large number of colors
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))))
celltypes<-sample(col_vector,length(colnames(res$beta)))
names(celltypes)<-colnames(res$beta)
col<-celltypes


#tempfix for abbreviated and now mismatching annotations
#can just use ann dataframe?
#tmpann<-cbind(ANN1,ANN2,SN)
tmpann <- pData(target_Data)[ann_names]



layout(matrix(c(1, 2, 3, 3), nrow = 2),
       widths = c(10, 3, 10, 3),
       heights = c(1, 8, 10),
      )

par(mar = c(0, 8.2, 0, 0.2))
plot(p1$tree_col, labels = F, main = "", ylab = "", yaxt = "n")
par(mar = c(15, 8, 0, 0))

# data to plot:
mat <- t(res$beta)[, p1$tree_col$order]
# infer scale of negative y-axis for annotation colorbars
ymin <- -max(colSums(mat)) * 0.15
if (!is.finite(ymin)) {
  ymin <- 0
}

# draw barplot:
bp <- barplot(mat,
              cex.lab = 1.5,
              col = col, border = NA,
              cex.names = 1.1,
              las = 2, main = "", ylab = "Abundance scores",
              ylim = c(ymin, max(colSums(mat)))
)


# add color bars for annotations
for (name in rev(variables_to_plot)) {
  yrange <- seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 1)[match(name, variables_to_plot) + c(0, 1)]
  xwidth <- (bp[2] - bp[1]) / 2
  for (i in 1:ncol(mat)) {
    rect(bp[i] - xwidth, yrange[2], bp[i] + xwidth, yrange[1],
         # border = NA, col = ann_colors[[name]][segmentAnnotations[match(colnames(mat)[i], segmentAnnotations$segmentID), name]]
         #border = NA, col = ann_colors[[name]][ann[p1$tree_col$order[i], name]]
         border = NA, col = color_list[[name]][tmpann[colnames(mat)[i], name]]
    )
  }
}
axis(2,
     at = seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 2)[-c(1, length(variables_to_plot) + 2)],
     las = 2, labels = variables_to_plot, lty = 0, cex.axis = 1.2
)

#draw a legend:
par(mar = c(0.1, 0.1, 0.1, 0.1))
frame()
legendcols <- legendnames <- c()
#for (name in rev(names(ann_colors))) {
for (name in c("slide_name", "ANN1", "ANN3")) {
  legendcols <- c(legendcols, NA, color_list[[name]], NA)
  legendnames <- c(legendnames, name, names(color_list[[name]]), NA)
}
legend("center",
       pch = 15,
       cex = 1.5,
       col = c(legendcols, rep(NA, 1), rev(col)),
       legend = c(legendnames, "Cell type", rev(names(col))),
)
```

## proportional

```{r SD_prop_barplot, fig.width=25,fig.height=15}
# define variables to show in heatmaps:
variables_to_plot <- c("slide_name", "ANN1", "ANN3")

layout(matrix(c(1, 2, 3, 3), nrow = 2),
       widths = c(10, 3, 10, 3),
       heights = c(1, 8, 10),
      )

par(mar = c(0, 8.2, 0, 0.2))
plot(p2$tree_col, labels = F, main = "", ylab = "", yaxt = "n")
par(mar = c(15, 8, 0, 0))

# data to plot:
mat <- t(res$prop_of_nontumor)[, p2$tree_col$order]
  mat <- replace(mat, is.na(mat), 0)
  # infer scale of negative y-axis for annotation colorbars
  ymin <- -0.15

# draw barplot:
bp <- barplot(mat,
              cex.lab = 1.5,
              col = col, border = NA,
              cex.names = 1.1,
              las = 2, main = "", ylab = "Proportion of fitted cells",
              ylim = c(ymin, max(colSums(mat)))
)

# add color bars for annotations
for (name in rev(variables_to_plot)) {
  yrange <- seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 1)[match(name, variables_to_plot) + c(0, 1)]
  xwidth <- (bp[2] - bp[1]) / 2
  for (i in 1:ncol(mat)) {
    rect(bp[i] - xwidth, yrange[2], bp[i] + xwidth, yrange[1],
         # border = NA, col = ann_colors[[name]][segmentAnnotations[match(colnames(mat)[i], segmentAnnotations$segmentID), name]]
         #border = NA, col = ann_colors[[name]][ann[p2$tree_col$order[i], name]]
         border = NA, col = color_list[[name]][tmpann[colnames(mat)[i], name]]
    )
  }
}
axis(2,
     at = seq(ymin / 3, ymin, length.out = length(variables_to_plot) + 2)[-c(1, length(variables_to_plot) + 2)],
     las = 2, labels = variables_to_plot, lty = 0, cex.axis = 1.2
)


#draw a legend:
par(mar = c(0.1, 0.1, 0.1, 0.1))
frame()
legendcols <- legendnames <- c()
#for (name in rev(names(ann_colors))) {
for (name in c("slide_name", "ANN1", "ANN3")) {
  legendcols <- c(legendcols, NA, color_list[[name]], NA)
  legendnames <- c(legendnames, name, names(color_list[[name]]), NA)
}
legend("center",
       pch = 15,
       cex = 1.4,
       col = c(legendcols, rep(NA, 1), rev(col)),
       legend = c(legendnames, "Cell type", rev(names(col))),
)
```

## Boxplots

Boxplot per cell from the spatial deconvolution. Choose your annotation in which you want to compare (ANN). 
The annotation will be divided in conditions and each cell with have a boxplot which include the abundance score of each condition.
A statistical test has been added inside the boxplot to check for significance. Feel free to chance the test to fit the data.

significance code         p-value
   ***                 [0, 0.001]
    **              (0.001, 0.01]
     *               (0.01, 0.05]
     .                (0.05, 0.1]
                         (0.1, 1] 

```{r fig.height=40, fig.width=40, warning=FALSE}
ANN <- "ANN1" # ANN column with conditions
if (length(unique(target_Data[[ANN]])) > 2) {
  test <- "kruskal.test"
} else {
  test <- "wilcox.test"
}

res_filter <- res[!res@phenoData@data[["beta"]]==0]
res_filter <- as.data.frame(res_filter$beta)
res_filter[res_filter == 0] <- NA
boxplot_prep <- res_filter[ , colSums(is.na(res_filter)) < nrow(res_filter)] 

boxplot_prep$condition <- rownames(boxplot_prep)

for (value in unique(target_Data[[ANN]])) {
  con <- rownames(pData(target_Data)[pData(target_Data)[[ANN]] == value,])
  boxplot_prep$condition[boxplot_prep$condition %in% con] <- value
}

boxplot <- melt(boxplot_prep)

# function for number of observations 
give.n <- function(x){
  return(data.frame(y = -1, label = paste0("n = ",length(x))))
 #return(c(y = median(x)*1.05, label = length(x))
# experiment with the multiplier to find the perfect position
}

ggplot(boxplot, aes(factor(condition), value, fill = condition)) + 
  geom_boxplot() + 
  geom_jitter(position=position_jitter(w=0.1,h=0.1)) +
  facet_wrap(~variable, scale="free", ncol = 5) +
  theme(text = element_text(size = 20)) +
  rotate_x_text(angle = 45) +
  labs(
    title = "Spatial Deconvolution",
    x = "Conditions",
    y = "Abuncance score") + 
  stat_compare_means(method = test, label.y = -3, size=5) + 
  stat_compare_means(label = "p.signif", method = "t.test",
                     ref.group = ".all.", hide.ns = TRUE, size=5) +
  stat_summary(fun.data = give.n, geom = "text", size = 5)


#stat_compare_means(size=5)
```


# 11 CNV
Copy Number Variation analysis with the R software package inferCNV (https://github.com/broadinstitute/inferCNV/wiki).

InferCNV is used to explore tumor single cell RNA-Seq data to identify evidence for somatic large-scale chromosomal copy number alterations, such as gains or deletions of entire chromosomes or large segments of chromosomes. This is done by exploring expression intensity of genes across positions of tumor genome in comparison to a set of reference 'normal' cells. A heatmap is generated illustrating the relative expression intensities across each chromosome, and it often becomes readily apparent as to which regions of the tumor genome are over-abundant or less-abundant as compared to that of normal cells.

Per patient a cnv analysis with a provided reference group. The gene order is made from the projects .pkc file.

Select the annotation where the CNV analysis should look at specifically as group and specify a subgroup as group_filter if the group of interest is a subgroup inside the annotation. For example if the CNV analysis needs to be only on tumor regions the group would be ANNX(ANN column with the info about the tumor regions) and group_filter PanCK+. 
The annotation that references the patients as patients. If there are no patients leave the parameter empty as "".
The annotation that should be included in the results including a reference set as cnv_target.
This will create patient specific files with all the information inferCNV needs.

```{r create cnv files, include=FALSE}
#########PARAMETERS########
group <- ""
group_filter <- ""

patients <- ""

cnv_target <- "ANN1"
print(paste("Show CNV on", cnv_target, "with the specificity on the groups (if any):", group, ";", group_filter))
###########################

if (!patients == "") {
  patient_list <- unique(target_Data[[patients]])
} else{
  patient_list <- "dummy"
}

file <- readLines(PKCFiles)
file <- gsub(' ', '',
        gsub('"', '', file))                                      # Format and remove space

display <- file[grepl("DisplayName", file, fixed = TRUE)]         # Grab display names
display <- gsub('DisplayName', "",
           gsub('"', "",
           gsub(':', "",
           gsub(',', "",
           gsub(' ', "",
           gsub('_01', "", display))))))                          # Grab only the names
display <- unique(display)                                        # Remove duplicates

chr <- file[grep("GenomeCoordinates", file)+1]                    # Get the chr positions under the GenomeCoordinates line
chr <-gsub(',', "", chr)                                          # Remove unwanted symbols

positions <- data.frame(
  name = display,
  chr = chr)
positions <- positions[!grepl("TargetSequence", positions$chr),]  # Remove entries without coordinates

positions$chr <- gsub(':', '-', positions$chr)                    # Format for splitting
positions$chr <- str_split_fixed(positions$chr, "-", 3)           # Split into chr, begin, and end position
positions <- as.matrix(positions)
positions <- as.data.frame(positions)                             # Swap types to recognize split columns as singular columns

positions$order <- as.numeric(gsub('chr', '', positions$chr.1))
positions <- positions[order(positions$order, decreasing = FALSE), ] # Order the chromosomes.
positions$order <- NULL

colnames(positions) <- NULL
rownames(positions) <- NULL                                       # inferCNV requires no column and row names

write.table(positions, "gene_order_Hs_WTA_v1_pkc.txt",
            sep = "\t",
            quote = FALSE,
            col.names = FALSE,
            row.names = FALSE)                                    # Write away


for (patient in patient_list) {
  if (group_filter != "") {
    group_interest <- target_Data@protocolData@data[["SampleID"]][target_Data[[group]] == group_filter & target_Data[[patients]] == patient]
  } else if (length(patient_list) > 1) {
    group_interest <- target_Data@protocolData@data[["SampleID"]][target_Data[[patients]] == patient]
  } else {
    group_interest <- target_Data@protocolData@data[["SampleID"]]
  }
  
  counts <- target_Data@assayData[["exprs"]]
  colnames(counts) <- gsub('.dcc', '',colnames(counts))
  counts <- counts[,colnames(counts) %in% c(group_interest)]
  write.table(counts, paste0(patient, ".Counts.tsv"), sep = "\t")                    # Make raw count file

  annotation <- target_Data@phenoData@data
  annotation$SampleID <- gsub('.dcc','', rownames(annotation))
  annotation <- annotation[annotation$SampleID %in% c(group_interest),]
  #annotation <- annotation[c("SampleID", "segment")]                  # Select SampleID and the needed annotation (CNV requires SampleID and only 1 annotation)
  annotation <- annotation[c("SampleID", cnv_target)]
  rownames(annotation) <- NULL
  colnames(annotation) <- NULL                                     # inferCNV requires no column and row names
  write.table(annotation, paste0(patient, ".Annotations.tsv"),
              sep = "\t",
              quote = FALSE,
              col.names = FALSE,
              row.names = FALSE)                                   # Make annotation file
}
```

Select the reference set in reference. This can be more than one. Every patient without its own reference will not be analysed.

```{r run CNV, echo=T, message=FALSE, warning=FALSE, results='hide'}
#########PARAMETERS########
reference <- c("normal")
###########################

failed <- c()
cnv_list <- list()

for (patient in patient_list) {
  # Name output folder
  out_dir <- paste0(patient, "_CNV")
  if (str_detect(paste(readLines(paste0(patient, ".Annotations.tsv")), collapse = ''), reference) == FALSE) {
    failed <- c(failed, patient)
    next
    }

  # Create the infercnv object
  infercnv_obj = CreateInfercnvObject(raw_counts_matrix= paste0(patient, ".Counts.tsv"),
                                      annotations_file= paste0(patient, ".Annotations.tsv"),
                                      delim="\t",
                                      gene_order_file= "gene_order_Hs_WTA_v1_pkc.txt",
                                      #gene_order_file= "gencode_v19_gene_pos.txt",
                                      ref_group_names= reference, # input the normal/reference group names
                                      chr_exclude = c("chrM"))#c("chrM")) # Default excludes chrX, chrY and chrM. By only picking chrM you include the X and Y chromosomes.

  # perform infercnv operations to reveal cnv signal. For all options: https://rdrr.io/github/broadinstitute/infercnv/man/run.html
  #cnv_list[[patient]]
  infercnv_obj <- infercnv::run(infercnv_obj,
                               cutoff=0.1,  # use 1 for smart-seq, 0.1 for 10x-genomics
                               out_dir= out_dir,  # dir is auto-created for storing outputs
                               cluster_by_groups=FALSE,   # cluster
                               denoise=TRUE,
                               HMM=FALSE,
                               tumor_subcluster_partition_method = c("qnorm"),
                               analysis_mode = "subclusters",
                               no_plot=TRUE
                               #,debug=TRUE
                               )

  plot_cnv(infercnv_obj,
          out_dir = paste0(patient, "_CNV"),
          title = "inferCNV",
          obs_title = "Observations (Cells)",
          ref_title = "References (Cells)",
          cluster_by_groups = FALSE,
          cluster_references = FALSE,
          plot_chr_scale = FALSE,
          #chr_lengths = NULL,
          k_obs_groups = 1,
          contig_cex = 1.5,
          #x.center = mean(infercnv_obj@expr.data),
          x.range = "auto",
          #hclust_method = "ward.D",
          output_filename = "infercnv",
          output_format = "png",
          png_res = 300
          )
}
```

```{r show CNV, fig.height=30, fig.width=25}
for (patient in patient_list) {
  print(paste(group_filter, patient))
  if (patient %in% failed) {
    print(" ^    Patient contained no reference group")
    next
  }
  img <- readPNG(paste(paste0(patient, "_CNV"), "/infercnv.png", sep = ""))
  grid::grid.newpage()
  grid::grid.raster(img)
}
```

# 11.1 Dendrogram

Closer look at the dendrogram from the CNV analysis per patient where the nodes as written numbers. Use the numbers to select subgroups for further analysis in 't-test on two subgroups'.

```{r, fig.width=20,fig.height=20}
#out_dir <- "T1_NANO_012_CNV"
trees <- list()

for (patient in patient_list) {
  if (patient %in% failed) {
    #print("Patient contained no reference group")
    next
  }
tree <- read.tree(paste(patient,"_CNV/infercnv.observations_dendrogram.txt", sep = ""))
obv <- read.csv(paste(patient,"_CNV/infercnv.observation_groupings.txt", sep = ""), sep="")
trees[[patient]] <- ggtree(
  tree, ladderize=F) +
  geom_treescale() +
  geom_tiplab(color=obv$Annotation.Color, hjust=-.2) +
  coord_cartesian(clip = 'off') +
  theme_tree2(plot.margin=margin(6, 200, 6, 6)) +
  geom_text2(aes(label=node), hjust=-.3, size = 3) +
  ggplot2::labs(title = patient)
}

grid.arrange(grobs=trees,ncol=3)
```

## t-test chr with groups

Compare the contrast within a chromosome. Select a chromosome, contrast, and patient of interest to run a t-test on.

```{r}
#########PARAMETERS########
chr <- "chrX" # Select chromosome of interest
contrast <- c("normal", "DKD") # Select contrast
patient <- "dummy"
###########################

positions <- as.data.frame(positions)
colnames(positions) <- c("gene", "chr", "begin", "end")
select_genes <- positions$gene[positions$chr == chr] # Grab chr specific genes

annotation <- as.data.frame(read.delim(paste0(patient, ".Annotations.tsv"), header=FALSE))
colnames(annotation) <- c("Sample_ID", "ANN")
select_samples <- annotation

filter_counts <- as.data.frame(target_Data@assayData[["log_q"]])
colnames(filter_counts) <- gsub('.dcc','', colnames(filter_counts))
filter_counts <- filter_counts[,select_samples$Sample_ID] # filter out samples that are not the interesting region

select_genes <- select_genes[select_genes %in% rownames(filter_counts)] # Only use genes that are actually in the data (pkc gene file has all of them)
filter_counts <- filter_counts[select_genes,] # filter out non chromosome specific genes
```

```{r ttest chr, fig.width=20,fig.height=10 }
plots<-list()
tables<-list()
labels<-list()
test<-"ttest"
mtc<-"BH"
counter=1

log_q_filter <-as.data.frame(filter_counts)

comps_df<-data.frame(comp='',val='')

for (active_group1 in contrast) {
    for (active_group2 in contrast) {

      #supress reduncant compares
      if(active_group1==active_group2) {next}
      comp<-paste(sort(c(active_group1,active_group2)),collapse = "_")
      #print(comp)
      if(comp %in% comps_df$comp) {next}
      temp_df<-data.frame(comp=comp ,val=1)
      comps_df<-rbind(comps_df,temp_df)

      labels[[counter]]<-paste(active_group1," vs ", active_group2)
      group1<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group1]]
      group2<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group2]]

      #run t_tests
      results<-as.data.frame ( apply(log_q_filter, 1, function(x) t.test(x[colnames(group1)],x[colnames(group2)])$p.value) )
      colnames(results)<-"raw_p_value"

      #multiple_testing_correction
      adj_p_value<- p.adjust(results$raw_p_value,method=mtc)
      results<-cbind(results,adj_p_value)

      #calc_fdr
      FDR<- p.adjust(results$raw_p_value,method="fdr")
      results<-cbind(results,FDR)

      #fold_changes
      #as base data is already log transformed, means need to be subtracted to get FC in log space
      fchanges<-as.data.frame( apply(log_q_filter, 1, function(x) (mean(x[colnames(group1)]) - mean(x[colnames(group2)]) ) ) )
      colnames(fchanges)<-"FC"
      results<-cbind(results,fchanges)

      #add genenames
      results$Gene<-rownames(results)

      #set categories based on P-value & FDR for plotting
      results$Color <- "NS or FC < 0.5"
      results$Color[results$adj_p_value < 0.05] <- "P < 0.05"
      results$Color[results$FDR < 0.05] <- "FDR < 0.05"
      results$Color[results$FDR < 0.001] <- "FDR < 0.001"
      results$Color[abs(results$FC) < 0.5] <- "NS or FC < 0.5"
      results$Color <- factor(results$Color,
                              levels = c("NS or FC < 0.5", "P < 0.05", "FDR < 0.05", "FDR < 0.001"))

      #vulcanoplot

      # pick top genes for either side of volcano to label
      # order genes for convenience:

      results$invert_P <- (-log10(results$adj_p_value)) * sign(results$FC)
      top_g <- c()
      top_g <- c(top_g,
                 results[ind, 'Gene'][
                   order(results[ind, 'invert_P'], decreasing = TRUE)[1:15]],
                 results[ind, 'Gene'][order(results[ind, 'invert_P'], decreasing = FALSE)[1:15]])
      top_g<- unique(top_g)
      results <- results[, -1*ncol(results)] # remove invert_P from matrix

      # Graph results
      plots[[counter]]<- ggplot(results,
                                      aes(x = FC, y = -log10(adj_p_value),
                                          color = Color, label = Gene)) +
        geom_vline(xintercept = c(0.5, -0.5), lty = "dashed") +
        geom_hline(yintercept = -log10(0.05), lty = "dashed") +
        geom_point() +
        labs(x = paste("Enriched genes in", chr, "-", active_group2," <- log2(FC) -> Enriched in", active_group1),
             y = "Significance, -log10(P)",
             color = "Significance") +
        scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                      `FDR < 0.05` = "lightblue",
                                      `P < 0.05` = "orange2",
                                      `NS or FC < 0.5` = "gray"),
                           guide = guide_legend(override.aes = list(size = 4))) +
        scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
        geom_text_repel(data = subset(results, FDR<0.05 & (-0.5>FC| FC>0.5)),
                        point.padding = 0.15, color = "black", size=3.5,
                        min.segment.length = .1, box.padding = .2, lwd = 2,
                        max.overlaps = 50) +
        theme_bw(base_size = 20) +
        theme(legend.position = "bottom") +
        #ggtitle(paste(slide,": ", test, mtc,"multitest corr"))
        ggtitle(paste(patient, ": ", test, mtc,"multitest corr"))

      #store tables for display later
      tables[[counter]]<-results

      counter = counter+1
      #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
    }
  }

grid.arrange(grobs=plots,ncol=2)
```

# t-test on two subgroups

Select nodes from the dendrogram as subgroups, in this case nodes 18 and 15. Decide on the involved chromosomes and add them into the chr_list. Specify the patient in patient.

```{r}
#########PARAMETERS########
chr_list <- c("chr22") # Choose needed chromosomes of the target area. Think of it as the x-axis
patient <- "dummy"
node_interest <- 233 # Choose subgroup 1
node_interest2 <- 216 # Choose subgroup 2
###########################

tree <- read.tree(paste(patient,"_CNV/infercnv.observations_dendrogram.txt", sep = ""))
tree_info <- tree %>% as.treedata %>% as_tibble # transform tree into accessible data

samples_interest <- offspring(tree_info, node_interest) # Get labels attached to the group
samples_interest <- as.character(na.omit(samples_interest$label)) # formatting
paste("Subgroup 1: ", samples_interest)

samples_interest2 <- offspring(tree_info, node_interest2)
samples_interest2 <- as.character(na.omit(samples_interest2$label))
paste("Subgroup 2: ", samples_interest2)
```

```{r}
positions <- as.data.frame(positions)
colnames(positions) <- c("gene", "chr", "begin", "end") # Formatting

select_genes <- positions$gene[positions$chr %in% chr_list] # | positions$chr == chr2] # Grab chromosome specific genes

annotation <- as.data.frame(read.delim(paste0(patient, ".Annotations.tsv"), header=FALSE))
colnames(annotation) <- c("Sample_ID", "ANN")
select_samples <- annotation[annotation$Sample_ID %in% samples_interest | annotation$Sample_ID %in% samples_interest2,] # Grab subgroup specific Sample IDs

filter_counts <- target_Data@assayData[["log_q"]]
colnames(filter_counts) <- gsub('.dcc','', colnames(filter_counts))
filter_counts <- filter_counts[,select_samples$Sample_ID] # Filter Sample ID's

select_genes <- select_genes[select_genes %in% rownames(filter_counts)] # Only use genes that are actually in the data (pkc gene file has all of them)
filter_counts <- filter_counts[select_genes,] # Filter genes
```

```{r ttest subgroups, fig.width=20,fig.height=10}
plots<-list()
tables<-list()
labels<-list()
test<-"ttest"
mtc<-"BH"
counter=1

log_q_filter <-as.data.frame(filter_counts)

comps_df<-data.frame(comp='',val='')


active_group1 <- samples_interest #subgroup 1
active_group2 <- samples_interest2 #subgroup 2

# for (active_group1 in c("sub1")) {
#     for (active_group2 in c("sub2")) {

      #supress reduncant compares
      #if(active_group1==active_group2) {next}
      #comp<-paste(sort(c(active_group1,active_group2)),collapse = "_")
      #print(comp)
      #if(comp %in% comps_df$comp) {next}
      temp_df<-data.frame(comp=comp ,val=1)
      comps_df<-rbind(comps_df,temp_df)

      # labels[[counter]]<-paste(active_group1," vs ", active_group2)
      # group1<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group1]]
      # group2<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$ANN==active_group2]]

      labels[[counter]]<-paste(active_group1," vs ", active_group2)
      group1<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$Sample_ID %in% active_group1]]
      group2<-log_q_filter[,names(as.data.frame(filter_counts))[select_samples$Sample_ID %in% active_group2]]

      #run t_tests
      results<-as.data.frame ( apply(log_q_filter, 1, function(x) t.test(x[colnames(group1)],x[colnames(group2)])$p.value) )
      colnames(results)<-"raw_p_value"

      #multiple_testing_correction
      adj_p_value<- p.adjust(results$raw_p_value,method=mtc)
      results<-cbind(results,adj_p_value)

      #calc_fdr
      FDR<- p.adjust(results$raw_p_value,method="fdr")
      results<-cbind(results,FDR)

      #fold_changes
      #as base data is already log transformed, means need to be subtracted to get FC in log space
      fchanges<-as.data.frame( apply(log_q_filter, 1, function(x) (mean(x[colnames(group1)]) - mean(x[colnames(group2)]) ) ) )
      colnames(fchanges)<-"FC"
      results<-cbind(results,fchanges)

      #add genenames
      results$Gene<-rownames(results)

      #set categories based on P-value & FDR for plotting
      results$Color <- "NS or FC < 0.5"
      results$Color[results$adj_p_value < 0.05] <- "P < 0.05"
      results$Color[results$FDR < 0.05] <- "FDR < 0.05"
      results$Color[results$FDR < 0.001] <- "FDR < 0.001"
      results$Color[abs(results$FC) < 0.5] <- "NS or FC < 0.5"
      results$Color <- factor(results$Color,
                              levels = c("NS or FC < 0.5", "P < 0.05", "FDR < 0.05", "FDR < 0.001"))

      #vulcanoplot

      # pick top genes for either side of volcano to label
      # order genes for convenience:

      results$invert_P <- (-log10(results$adj_p_value)) * sign(results$FC)
      top_g <- c()
      top_g <- c(top_g,
                 results[ind, 'Gene'][
                   order(results[ind, 'invert_P'], decreasing = TRUE)[1:15]],
                 results[ind, 'Gene'][order(results[ind, 'invert_P'], decreasing = FALSE)[1:15]])
      top_g<- unique(top_g)
      results <- results[, -1*ncol(results)] # remove invert_P from matrix

      # Graph results
      #plots[[counter]]<- ggplot(results,
      p <- ggplot(results,
                                      aes(x = FC, y = -log10(adj_p_value),
                                          color = Color, label = Gene)) +
        geom_vline(xintercept = c(0.5, -0.5), lty = "dashed") +
        geom_hline(yintercept = -log10(0.05), lty = "dashed") +
        geom_point() +
        labs(x = paste("Enriched genes from", chr_list, "in", "subgroup 2"," <- log2(FC) -> Enriched in", "subgroup 1"),
             y = "Significance, -log10(P)",
             color = "Significance") +
        scale_color_manual(values = c(`FDR < 0.001` = "dodgerblue",
                                      `FDR < 0.05` = "lightblue",
                                      `P < 0.05` = "orange2",
                                      `NS or FC < 0.5` = "gray"),
                           guide = guide_legend(override.aes = list(size = 4))) +
        scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
        geom_text_repel(data = subset(results, FDR<0.05 & (-0.5>FC| FC>0.5)),
                        point.padding = 0.15, color = "black", size=3.5,
                        min.segment.length = .1, box.padding = .2, lwd = 2,
                        max.overlaps = 50) +
        theme_bw(base_size = 20) +
        theme(legend.position = "bottom") +
        #ggtitle(paste(slide,": ", test, mtc,"multitest corr"))
        ggtitle(paste(patient, ": ", test, mtc,"multitest corr"))

      #store tables for display later
      tables[[counter]]<-results

      counter = counter+1
      #datatable(subset(results, Gene %in% GOI), rownames=FALSE,caption = paste("DE results ", active_group1," vs ", active_group2))
  #   }
  # }

#ggplotly(p)
p
#grid.arrange(grobs=plots,ncol=2)
```

# 12 Code & Versions

Pipelineversion: v1 based on:
<https://bioconductor.org/packages/devel/workflows/vignettes/GeoMxWorkflows/inst/doc/GeomxTools_RNA-NGS_Analysis.html>

The underlying code can be downloaded from the 'Code', button on the top
of this page. Choose option 'download Rmd' to download the full pipeline
which can be opened in R or Rstudio. Some filepaths are hardcoded and
need to be changed according to your setup.

# 12.1 R session information

```{r session_info}
sessionInfo()
```

## 12.2 References

1 Levi van Hijfte, Marjolein Geurts, Wies R. Vallentgoed, Paul H.C. Eilers, Peter A.E. Sillevis Smitt, Reno Debets, Pim J. French,
Alternative normalization and analysis pipeline to address systematic bias in NanoString GeoMx Digital Spatial Profiling data,
iScience, Volume 26, Issue 1, 2023, 105760, ISSN 2589-0042, https://doi.org/10.1016/j.isci.2022.105760.

2 Zhao, Y., Wong, L. & Goh, W.W.B. How to do quantile normalization correctly for gene expression data analyses. Sci Rep 10, 15534 (2020). https://doi.org/10.1038/s41598-020-72664-6

![](http://useq.nl/wp-content/uploads/2022/12/decoration-stroke-flat.png)

```{r}
knitr::knit_exit()
```

# 9.0 reload data

```{r spatial_decon_prepare}
#reload data
# Data <-
#   readNanoStringGeoMxSet(dccFiles = DCCFiles,
#                          pkcFiles = PKCFiles,
#                          phenoDataFile = SampleAnnotationFile,
#                          phenoDataSheet = "Sheet1",
#                          phenoDataDccColName = "Sample_ID",
#                          protocolDataColNames = c("aoi", "roi"),
#                          experimentDataColNames = c("panel"))
# 
# pkcs <- annotation(Data)
# modules <- gsub(".pkc", "", pkcs)
# 
# #shift any expression counts with a value of 0 to 1 to enable in downstream transformations.
# Data <- shiftCountsOne(Data, useDALogic = TRUE)
# 
# #collaps_targets
# target_Data <- aggregateCounts(Data)
# dim(target_Data)
# 
# #normalize
# target_Data <- normalize(target_Data , data_type = "RNA",
#                              norm_method = "quant", 
#                              desiredQuantile = .75,
#                              toElt = "q_norm")
```

# 9.4 Run advanced spatial devoncolution

```{r}
# vector identifying pure tumor segments:
target_Data$istumor = target_Data$ANN2 == "Epithelium"

# run spatialdecon with all the bells and whistles:
restils = runspatialdecon(object = target_Data,
                          norm_elt = "q_norm",                    # normalized data
                          raw_elt = "exprs",                      # expected background counts for every data point in norm
                          X = safeTME,                            # safeTME matrix, used by default
                          cellmerges = safeTME.matches,           # safeTME.matches object, used by default
                          cell_counts = target_Data$nuclei,      # nuclei counts, used to estimate total cells
                          is_pure_tumor = target_Data$istumor,   # identities of the Tumor segments/observations
                          n_tumor_clusters = 5)                   # how many distinct tumor profiles to append to safeTME

#str(pData(restils))
heatmap(sweep(restils@experimentData@other$SpatialDeconMatrix, 1, apply(restils@experimentData@other$SpatialDeconMatrix, 1, max), "/"),
         labRow = NA, margins = c(10, 5))

```

<!-- # 9.3.2 Plotting deconvolution results -->

<!-- ```{r, fig.width=15,fig.height=7} -->

<!-- # For reference, show the TILs color data object used by the plotting functions  -->

<!-- # when safeTME has been used: -->

<!-- data("cellcols") -->

<!-- #cellcols -->

<!-- o = hclust(dist(t(res$cell.counts$cell.counts)))$order -->

<!-- layout(matrix(c(1, 2), 1), widths = c(7, 3)) -->

<!-- TIL_barplot(t(res$cell.counts$cell.counts[, o]), -->

<!--             horiz = TRUE, draw_legend = TRUE, cex.names = 0.9) -->

<!-- #par(mar=c(2, 15, 2, 2)) -->

<!-- # or the proportions of cells: -->

<!-- temp = replace(res$prop_of_nontumor, is.na(res$prop_of_nontumor), 0) -->

<!-- o = hclust(dist(temp[res$ANN2 == "CD45",]))$order -->

<!-- TIL_barplot(t(res$prop_of_all),  -->

<!--             horiz = TRUE, draw_legend = TRUE, cex.names = 0.9) -->

<!-- ``` -->

# 9.3.3 Florets of Spatial deconvolution

The second function is "florets", used for plotting cell abundances atop
some 2-D projection. Here, we'll plot cell abundances atop the first 2
principal components of the data:

```{r}
xy <- read_excel("L:/pkloosterman/Kidney_organ_atlas/annotation/OrganAtlas_Kidney.xlsx", sheet = "SegmentProperties")
xy$newx <- (xy$ROICoordinateX - xy$ScanOffsetX)
xy$newy <- (xy$ROICoordinateY - xy$ScanOffsetY)
xy <- xy[,c("SegmentDisplayName", "newx", "newy")]
xy["aoi"] <- xy["SegmentDisplayName"]
annot <- target_Data@protocolData@data
annot <- annot %>% inner_join(xy, by = 'aoi') #%>% select(Description, ID, geneID, leadingEdge, padj)
annot$x <- annot$newx
annot$y <- annot$newy
```

```{r, fig.width=25,fig.height=15, echo=FALSE, out.width="50%"}
#P36 = createPalette(length(rownames(t(res$beta))),  c("#ff0000", "#00ff00", "#0000ff"))

xlim <- max(res@phenoData@data$x)
ylim <- max(res@phenoData@data$y)

ind <- pData(res)$slide_name == "hu_kidney_001"
res2 <- res[,ind]

#a <- res2[!res2@phenoData@data[["beta"]]==0]

#slide_list[[4]] <- flipY(slide_list[[4]])

fluor <- fluorLegend(overlay = slide_list[[4]], nrow = 4, textSize = 5, alpha = 0.25)

im <- plotSpatialOverlay(overlay = slide_list[[4]], legend = FALSE, hiRes = FALSE, corner = "bottomright", 
                   scaleBarWidth = 0.5, textDistance = 5, scaleBarColor = "white", image = TRUE) + 
  ggplot2::labs(title = paste("Whole slide OME-TIFF", slideName(slide_list[[4]])))

#plot(0, xlim = c(0, xlim+40000), ylim = c(0, ylim), axes = FALSE)

cowplot::ggdraw() +
  cowplot::draw_plot(im, x = -0.3, y = 0) +
  cowplot::draw_plot(fluor, scale = 0.12, x = 0.1, y = 0) #+ #, scale = 0.12, x = -0.3, y = -0.25)


florets(
  x = res2@phenoData@data$x,# + 40000,
  y = res2@phenoData@data$y,
  col = P36,
  #col = 1:length(rownames(t(a$beta))),
  xlim = c(0,xlim+40000),
  ylim = c(0,ylim),
  b = t(a$beta),
  rescale.by.sqrt = TRUE,
  add = FALSE,
  cex = 1, xlab = "", ylab = "")

legend("right",
       #fill = 1:length(rownames(t(a$beta))),
       fill = P36,
       legend = rownames(t(a$beta)), cex = 1.5)
```

# 9.4 combining celltypes

When two cell types are too similar, the estimation of their abundances
becomes unstable. However, their sum can still be estimated easily. The
function "collapseCellTypes" takes a deconvolution results object and
collapses any colsely-related cell types you tell it to

```{r}
matching = list()
matching$myeloid = c( "macrophages", "monocytes", "mDCs")
matching$T.NK = c("CD4.T.cells","CD8.T.cells", "Treg", "NK")
matching$B = c("B")
matching$mast = c("mast")
matching$neutrophils = c("neutrophils")
matching$stroma = c("endothelial.cells", "fibroblasts")

collapsed = collapseCellTypes(fit = restils, matching = matching)

heatmap(collapsed$beta, cexRow = 0.85, cexCol = 0.75)
```

# Ligand-receptor analyse

```{r}
#devtools::install_github("saeyslab/nichenetr")
#install.packages('circlize')
library(nichenetr)
library(Seurat)
library(circlize)
```

```{r, fig.width=20,fig.height=10}
# Get counts data
counts <- as.data.frame(target_Data@assayData[["q_norm"]])
counts$X <- rownames(counts)
#rownames(counts) <- NULL
#counts <- as.matrix(counts)

# Get meta data
meta <- pData(target_Data)
meta <- meta[c("ANN1", "ANN2", "slide_name")]
meta$SampleID <- protocolData(target_Data)[["SampleID"]]
meta$roi <- protocolData(target_Data)[["roi"]]

# create Seurat for better niche cooperation
Seurat.Data <- CreateSeuratObject(counts = counts, meta.data = meta) 

## Check meta data
#Seurat.Data@meta.data %>% head()
#Seurat.Data@meta.data$Region %>% table() 

## Change idents to correct annotation
Seurat.Data <- SetIdent(Seurat.Data, value = "ANN1")
#Seurat.Data@active.ident

####### Read in ligand-target prior model, ligand-receptor network and weighted integrated networks
ligand_target_matrix = readRDS(url("https://zenodo.org/record/3260758/files/ligand_target_matrix.rds"))
#ligand_target_matrix[1:5,1:5] # target genes in rows, ligands in columns

lr_network = readRDS(url("https://zenodo.org/record/3260758/files/lr_network.rds"))
#head(lr_network)

weighted_networks = readRDS(url("https://zenodo.org/record/3260758/files/weighted_networks.rds"))
weighted_networks_lr = weighted_networks$lr_sig %>% inner_join(lr_network %>% distinct(from,to), by = c("from","to"))
#head(weighted_networks$lr_sig) # interactions and their weights in the ligand-receptor + signaling network


#### Define a sender and receiver population and determine which genes are expressed in both populations

## receiver
receiver = c("CD10+")
expressed_genes_receiver = get_expressed_genes(receiver, Seurat.Data, pct = 0.10)
background_expressed_genes = expressed_genes_receiver %>% .[. %in% rownames(ligand_target_matrix)]

## sender
sender_celltypes = c("PanCK+")
list_expressed_genes_sender = sender_celltypes %>% unique() %>% lapply(get_expressed_genes, Seurat.Data, 0.10) # lapply to get the expressed genes of every sender cell type separately here
expressed_genes_sender = list_expressed_genes_sender %>% unlist() %>% unique()

#### Import differentially expressed genes in reciever populations
geneset.oi <- top_g

#####  Define a set of ligands in sender populations
ligands = lr_network %>% pull(from) %>% unique()
receptors = lr_network %>% pull(to) %>% unique()

expressed_ligands = intersect(ligands,expressed_genes_sender)
expressed_receptors = intersect(receptors,expressed_genes_receiver)

potential_ligands = lr_network %>% filter(from %in% expressed_ligands & to %in% expressed_receptors) %>% pull(from) %>% unique()


##############
########### Perform Nichenet analysis


ligand_activities = predict_ligand_activities(geneset = geneset.oi, background_expressed_genes = background_expressed_genes, ligand_target_matrix = ligand_target_matrix, potential_ligands = potential_ligands)

ligand_activities = ligand_activities %>% arrange(-pearson) 
#ligand_activities = ligand_activities %>% arrange(-pearson) %>% mutate(rank = rank(desc(pearson)))


##############
####### Histogram of ligand activities

p_hist_lig_activity = ggplot(ligand_activities, aes(x=pearson)) + 
  geom_histogram(color="black", fill="darkorange")  + 
  # geom_density(alpha=.1, fill="orange") +
  geom_vline(aes(xintercept=min(ligand_activities %>% top_n(20, pearson) %>% pull(pearson))), color="red", linetype="dashed", size=1) + 
  labs(x="ligand activity (PCC)", y = "# ligands") +
  theme_classic()
p_hist_lig_activity


### Dot blot of top ligands split by region

best_upstream_ligands = c(ligand_activities %>% top_n(20, pearson) %>% arrange(-pearson) %>% pull(test_ligand) %>% unique())

# test <- Seurat.Data
# yeet <- c("WNT3A")
# data("pbmc_small")
# test[["RNA"]]@var.features <- best_upstream_ligands
# #data.frame(row.names = rownames(test[["RNA"]]))
# 
# DotPlot(test, features = best_upstream_ligands, cols = "RdYlBu") + RotatedAxis()

## else

active_ligand_target_links_df = best_upstream_ligands %>% lapply(get_weighted_ligand_target_links,geneset = geneset.oi, ligand_target_matrix = ligand_target_matrix, n = 250) %>% bind_rows()
active_ligand_target_links = prepare_ligand_target_visualization(ligand_target_df = active_ligand_target_links_df, ligand_target_matrix = ligand_target_matrix, cutoff = 0.25)

order_ligands = intersect(best_upstream_ligands, colnames(active_ligand_target_links)) %>% rev()
order_targets = active_ligand_target_links_df$target %>% unique()
vis_ligand_target = active_ligand_target_links[order_targets,order_ligands] %>% t()

p_ligand_target_network = vis_ligand_target %>% make_heatmap_ggplot("Prioritized PanCK+ ligands","CD10+ potential receptor cells", color = "purple",legend_position = "top", x_axis_position = "top",legend_title = "Regulatory potential") + scale_fill_gradient2(low = "whitesmoke",  high = "purple", breaks = c(0,0.005,0.01)) + theme(axis.text.x = element_text(face = "italic"))

p_ligand_target_network

## circos

# QC on quantile
cutoff_include_all_ligands = active_ligand_target_links_df$weight %>% quantile(0.66)
active_ligand_target_links_df_circos = active_ligand_target_links_df %>% filter(weight > cutoff_include_all_ligands)

ligands_to_remove = setdiff(active_ligand_target_links_df$ligand %>% unique(), active_ligand_target_links_df_circos$ligand %>% unique())
targets_to_remove = setdiff(active_ligand_target_links_df$target %>% unique(), active_ligand_target_links_df_circos$target %>% unique())
  
circos_links = active_ligand_target_links_df %>% filter(!target %in% targets_to_remove &!ligand %in% ligands_to_remove)

# Color
grid_col_ligand =c("PanCK+" = "red"
                  #,"General" = "green"
                   )
grid_col_target =c("CD10+" = "green")

grid_col_tbl_ligand = tibble(ligand_type = grid_col_ligand %>% names(), color_ligand_type = grid_col_ligand)
grid_col_tbl_target = tibble(target_type = grid_col_target %>% names(), color_target_type = grid_col_target)

circos_links = circos_links %>% mutate(ligand = paste(ligand," ")) # extra space: make a difference between a gene as ligand and a gene as target!
circos_links = circos_links %>% inner_join(grid_col_tbl_ligand, by = character()) %>% inner_join(grid_col_tbl_target, by = character())
links_circle = circos_links %>% select(ligand,target, weight)

ligand_color = circos_links %>% distinct(ligand,color_ligand_type)
grid_ligand_color = ligand_color$color_ligand_type %>% set_names(ligand_color$ligand)
target_color = circos_links %>% distinct(target,color_target_type)
grid_target_color = target_color$color_target_type %>% set_names(target_color$target)

grid_col = c(grid_ligand_color,grid_target_color)

# give the option that links in the circos plot will be transparant ~ ligand-target potential score
transparency = circos_links %>% mutate(weight =(weight-min(weight))/(max(weight)-min(weight))) %>% mutate(transparency = 1-weight) %>% .$transparency 

# Specific order with gaps if needed
target_order = circos_links$target %>% unique()
#ligand_order = c(CAF_specific_ligands,general_ligands,endothelial_specific_ligands) %>% c(paste(.," ")) %>% intersect(circos_links$ligand)
ligand_order = c() %>% c(paste(.," ")) %>% intersect(circos_links$ligand)
order = c(ligand_order,target_order)

width_same_cell_same_ligand_type = 0.5
width_different_cell = 6
width_ligand_target = 15
width_same_cell_same_target_type = 0.5

gaps = c(
  # width_ligand_target,
  rep(width_same_cell_same_ligand_type, times = (circos_links %>% filter(ligand_type == "PanCK+") %>% distinct(ligand) %>% nrow() -1)),
  width_different_cell,
  # rep(width_same_cell_same_ligand_type, times = (circos_links %>% filter(ligand_type == "General") %>% distinct(ligand) %>% nrow() -1)),
  # width_different_cell,
  #rep(width_same_cell_same_ligand_type, times = (circos_links %>% filter(ligand_type == "Endothelial-specific") %>% distinct(ligand) %>% nrow() -1)), 
  #width_ligand_target,
  rep(width_same_cell_same_target_type, times = (circos_links %>% filter(target_type == "CD10+") %>% distinct(target) %>% nrow() -1)),
  width_ligand_target
  )

#circos.par(gap.degree = gaps)

circos.par(RESET = TRUE)

chordDiagram(links_circle, 
             directional = 1,
             #order=order,
             link.sort = TRUE, 
             link.decreasing = FALSE, 
             grid.col = grid_col,
             transparency = transparency, 
             diffHeight = 0.005, 
             direction.type = c("diffHeight", "arrows"),
             link.arr.type = "big.arrow", 
             link.visible = links_circle$weight >= cutoff_include_all_ligands,
             annotationTrack = "grid",
             preAllocateTracks = list(track.height = 0.075))
             #preAllocateTracks = list(track.height = max(strwidth(unlist(dimnames(links_circle))))))  
             
# we go back to the first track and customize sector labels
circos.track(track.index = 1, panel.fun = function(x, y) {
    circos.text(CELL_META$xcenter, CELL_META$ylim[1], CELL_META$sector.index,
        facing = "clockwise", niceFacing = TRUE, adj = c(0, 0.55), cex = 1)
}, bg.border = NA) 

legend("bottomleft", 
       title = "Cell specific",
       cex= 3, 
       legend = unique(c(circos_links$ligand_type, unique(circos_links$target_type))),
       fill = unique(c(grid_col)))
title("Ligand-receptor connections")
```

```{r, fig.width=25,fig.height=10}
# get the ligand-receptor network of the top-ranked ligands
lr_network_top = lr_network %>% filter(from %in% best_upstream_ligands & to %in% expressed_receptors) %>% distinct(from,to)
best_upstream_receptors = lr_network_top %>% pull(to) %>% unique()

# # get the weights of the ligand-receptor interactions as used in the NicheNet model
# weighted_networks = readRDS(url("https://zenodo.org/record/3260758/files/weighted_networks.rds"))
lr_network_top_df = weighted_networks$lr_sig %>% filter(from %in% best_upstream_ligands & to %in% best_upstream_receptors)

# convert to a matrix
lr_network_top_df = lr_network_top_df %>% spread("from","weight",fill = 0)
lr_network_top_matrix = lr_network_top_df %>% select(-to) %>% as.matrix() %>% magrittr::set_rownames(lr_network_top_df$to)

# perform hierarchical clustering to order the ligands and receptors
dist_receptors = dist(lr_network_top_matrix, method = "binary")
hclust_receptors = hclust(dist_receptors, method = "ward.D2")
order_receptors = hclust_receptors$labels[hclust_receptors$order]

dist_ligands = dist(lr_network_top_matrix %>% t(), method = "binary")
hclust_ligands = hclust(dist_ligands, method = "ward.D2")
order_ligands_receptor = hclust_ligands$labels[hclust_ligands$order]


vis_ligand_receptor_network = lr_network_top_matrix[order_receptors, order_ligands_receptor]
p_ligand_receptor_network = vis_ligand_receptor_network %>% t() %>% make_heatmap_ggplot("Prioritized PanCK+ligands","Receptors expressed by CD10+", color = "mediumvioletred", x_axis_position = "top",legend_title = "Prior interaction potential")
p_ligand_receptor_network
```

# CNV

```{r}
#BiocManager::install("infercnv")
library(infercnv)
#devtools::install_github("aerickso/SpatialInferCNV")
#library(SpatialInferCNV)
```

```{r create cnv files_}
file <- readLines(PKCFiles)
file <- gsub(' ', '',
        gsub('"', '', file))                                      # Format and remove space

display <- file[grepl("DisplayName", file, fixed = TRUE)]         # Grab display names
display <- gsub('DisplayName', "",
           gsub('"', "",
           gsub(':', "",
           gsub(',', "",
           gsub(' ', "",
           gsub('_01', "", display))))))                          # Grab only the names
display <- unique(display)                                        # Remove duplicates

chr <- file[grep("GenomeCoordinates", file)+1]                    # Get the chr positions under the GenomeCoordinates line
chr <-gsub(',', "", chr)                                          # Remove unwanted symbols

positions <- data.frame(
  name = display, 
  chr = chr)
positions <- positions[!grepl("TargetSequence", positions$chr),]  # Remove entries without coordinates

positions$chr <- gsub(':', '-', positions$chr)                    # Format for splitting
positions$chr <- str_split_fixed(positions$chr, "-", 3)           # Split into chr, begin, and end position
positions <- as.matrix(positions) 
positions <- as.data.frame(positions)                             # Swap types to recognize split columns as singular columns
colnames(positions) <- NULL
rownames(positions) <- NULL                                       # inferCNV requires no column and row names

write.table(positions, "gene_order_Hs_WTA_v1_pkc.txt", 
            sep = "\t",
            quote = FALSE, 
            col.names = FALSE, 
            row.names = FALSE)                                    # Write away

counts <- target_Data@assayData[["exprs"]]
write.table(counts, "Counts.tsv", sep = "\t")                    # Make raw count file

annotation <- target_Data@phenoData@data
annotation$SampleID <- rownames(annotation)
annotation <- annotation[c("SampleID", "ANN2")]                  # Select SampleID and the needed annotation (CNV requires SampleID and only 1 annotation)
rownames(annotation) <- NULL
colnames(annotation) <- NULL                                     # inferCNV requires no column and row names
write.table(annotation, "Annotations.tsv", 
            sep = "\t",
            quote = FALSE, 
            col.names = FALSE, 
            row.names = FALSE)                                   # Make annotation file
```


```{r run CNV_, include=FALSE}
# Pick normal/reference group(s)
reference <- c("Cortical glomerulus")

# Name output folder
out_dir <- "CNV"

# Create the infercnv object
infercnv_obj = CreateInfercnvObject(raw_counts_matrix= "Counts.tsv",
                                    annotations_file= "Annotations.tsv",
                                    delim="\t",
                                    gene_order_file= "gene_order_Hs_WTA_v1_pkc.txt",
                                    #gene_order_file= "gencode_v19_gene_pos.txt",
                                    ref_group_names= reference, # input the normal/reference group names
                                    chr_exclude = c("chrM")) # Default excludes chrX, chrY and chrM. By only picking chrM you include the X and Y chromosomes.     

# perform infercnv operations to reveal cnv signal. For all options: https://rdrr.io/github/broadinstitute/infercnv/man/run.html
infercnv_obj = infercnv::run(infercnv_obj,
                             cutoff=0.1,  # use 1 for smart-seq, 0.1 for 10x-genomics
                             out_dir= out_dir,  # dir is auto-created for storing outputs
                             cluster_by_groups=F,   # cluster
                             denoise=T,
                             HMM=F,
                             analysis_mode = "subclusters",
                             #,debug=TRUE
                             )

          # "GenomeCoordinates": [
          #   "chr13:112378731-112393069", 
          #   "chr13:112387396-112393069", 
          #   "chr13:112376479-112393069"
```

```{r show CNV_, fig.height=30, fig.width=25}
img <- readPNG(paste(out_dir, "/infercnv.png", sep = ""))
grid::grid.newpage()
grid::grid.raster(img)
```

